diff --git a/.gitattributes b/.gitattributes index c0d99b39e07957ff34ff35da2919856c5724380b..7717a6f81a6613b732acd3959dd7173535037a1e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -76,3 +76,32 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text 2025.03.05/13.39.58_train_llm_lowdim_push-back-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-back-v2/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text 2025.03.05/13.39.58_train_llm_lowdim_push-back-v2/wandb/run-20250305_133958-ognmy7d4/run-ognmy7d4.wandb filter=lfs diff=lfs merge=lfs -text 2025.02.18/12.48.25_train_bc_lowdim_push-v2/wandb/run-20250218_124831-m3b3rr9e/run-m3b3rr9e.wandb filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/run-6awu8klx.wandb filter=lfs diff=lfs merge=lfs -text diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/config.yaml b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..013b11f2c65fc2417af8e71cc678d6978c39743c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/config.yaml @@ -0,0 +1,113 @@ +name: train_llm_lowdim +_target_: llmbc.workspace.train_llm_workspace.TrainLLMWorkspace +obs_dim: ${task.obs_dim} +action_dim: ${task.action_dim} +horizon: 1 +n_obs_steps: 1 +n_action_steps: 1 +task_name: ${task.name} +exp_name: train llm +model_name: ${llm.name} +use_quantization: ${llm.use_quantization} +lora_config: ${llm.lora_config} +dataset: + test_data_ratio: 0.01 +debug: false +training: + seed: 42 + per_device_train_batch_size: 1 + per_device_eval_batch_size: 1 + gradient_accumulation_steps: 64 + optim: paged_adamw_32bit + num_train_epochs: 20 + eval_strategy: steps + logging_steps: 1 + warmup_steps: 10 + logging_strategy: steps + learning_rate: 3.0e-05 + fp16: false + bf16: true + tf32: true + group_by_length: true + report_to: wandb + save_steps: 200 + eval_steps: 10 + use_joint_mlp_projector: ${llm.use_joint_mlp_projector} + joint_obs_action_mlp_lr: 1.0e-06 +trainer: + obs_dim: ${obs_dim} + action_dim: ${action_dim} + use_joint_mlp_projector: ${llm.use_joint_mlp_projector} + max_seq_length: ${llm.max_length} + dataset_text_field: text + packing: false +logging: + project: llm_module_finetuning + resume: true + mode: online + name: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} + tags: + - ${name} + - ${task_name} + - ${exp_name} + id: null + group: null +multi_run: + run_dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + wandb_name_base: ${now:%Y.%m.%d-%H.%M.%S}_${name}_${task_name} +task: + name: push-v2 + obs_dim: 9 + action_dim: 4 + env_runner: + _target_: llmbc.env_runner.metaworld_lowdim_runner.MetaworldLowdimRunner + env_name: llf-metaworld-push-v2 + max_steps: 30 + n_obs_steps: ${n_obs_steps} + n_action_steps: ${n_action_steps} + instruction_type: b + feedback_type: + - hp + - hn + - fp + visual: false + dataset: + _target_: llmbc.dataset.metaworld_lowdim_dataset.MetaworldLowdimDataset + data_path: datasets/push-v2-general.pt + data_path2: datasets/push-v2.pt + horizon: ${horizon} + pad_before: ${eval:'${n_obs_steps}-1'} + pad_after: ${eval:'${n_action_steps}-1'} + obs_eef_target: true + use_manual_normalizer: false + val_ratio: 0.2 + dummy_normalizer: true + instructor: + _target_: llmbc.translator.instructor.metaworld_instructor.push_v2_instructor.PushV2Instructor +llm: + name: meta-llama/Llama-3.2-1B-Instruct + model_name: Llama-3.2-1B-Instruct + use_quantization: false + load_from_checkpoint: false + adaptor_path: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.05/13.39.46_train_llm_lowdim_sweep-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-sweep-v2/checkpoint-3550 + use_orig_model: false + use_joint_mlp_projector: true + load_from_mlp_projector_checkpoint: true + mlp_projector_checkpoint_path: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.09/12.41.09_train_mlp_projector_metaworld/checkpoints/latest.ckpt + max_length: 100 + config_target: llmbc.model.llm.llama_lowdim_model.LowdimLlamaConfig + causal_lm_target: llmbc.model.llm.llama_lowdim_model.LowdimLlamaForCausalLM + lora_config: + r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + bias: none + task_type: CAUSAL_LM + prompter: + _target_: llmbc.translator.prompter.llama_prompter.LlamaPrompter + use_joint_mlp_projector: true + hydra: + job: + override_dirname: ${model_name} + run: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${model_name} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/hydra.yaml b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cbc703365640a6311ff2ad4856d242d784708610 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/hydra.yaml @@ -0,0 +1,156 @@ +hydra: + run: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + sweep: + dir: data/outputs/${now:%Y.%m.%d}/${now:%H.%M.%S}_${name}_${task_name} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: [] + job: + name: train_llm_workspace + chdir: null + override_dirname: '' + id: ??? + num: ??? + config_name: train_llm_workspace + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.2.0 + version_base: '1.2' + cwd: /home/chyang/workspace/LLM-BC + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /home/chyang/workspace/LLM-BC/llmbc/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2 + choices: + llm: llama-3.2-1b-instruct + task: push-v2 + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/overrides.yaml b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe51488c7066f6687ef680d6bfaa4f7768ef205c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/.hydra/overrides.yaml @@ -0,0 +1 @@ +[] diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be3941b5328fd1ccd2f8dafb34a49b4a80e4c6cd --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7991a23e501447dc2f286d36e0f8cbb601735f39ce53187169fe163c5c18b4a +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..97cc549bc1344bd82c87327c3620c929d76ced72 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a67da5fc77769da8b399f1014d3b67bfa548f5a956db2ad7c336521459712d3f +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..1030d1d8e32363e86e7a2f9c13af95bbdaec95ff --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479425b06b56c48309c34e160eca886bdc0906b225beb7dd316ed8222b1ea92b +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe3cd4de1682cb8a484bc2666caa73784954bac7 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6826f91a1f8ac3a2c0b1b30d61970d948f27a47335805ea8cb35b67818482140 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..fff36500e58d18f0e78c91421da1feb7f2325211 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e9b318849f673d6d4304f4a6becbdab431f636b4c198e68a6db81c6f622e277 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2665bb8d3d0a5d2ebbd9bc0e3b22ae28ecbc888b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8758e39009d1b5a43432dd9aedba62a97e95100d1c8d9fcda0bb4cfc72e50ae4 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b5ab25ab3988ce79f5f22f6222b417a1ec73151b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/trainer_state.json @@ -0,0 +1,7833 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.6103837471783295, + "eval_steps": 10, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.91170047026135e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..68867a2b1165f4dfd34c14946c8471ee92fc6680 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add3d7bbfd67d63c1a56d95d3e37121185297d10b64c97a8ef2ebe051d9efeda +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f8c558bdcff1c2560d2da2cc7a06daac0281f21 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:987394ffa16179ff049c43c453fc8a429907f25373c026bc6cb8047447f6969a +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5705d7ee0ed17ad89318aad3095805fcbce113da --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0434106b0bb9937b157cf203667944a74c3b23a093d73b27ada95121acca288b +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..842e4c9b3da77c0bb0776b7058aa4d60543239cc --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9ada8e9045a3139a04588e1c9fa6d74683e291967aafb9fac5cf27cd81e68a6 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9608cc4fa6af4baff85a2d2f2cdc8ad537e1a994 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50835ebede7779a568bd1f6be48d6a6d7f42b99ca3d5e3dc04877f92d75946c0 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f28005f9e0130da77f0f875c4d772620b32ab414 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/trainer_state.json @@ -0,0 +1,9393 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.332279909706546, + "eval_steps": 10, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.888921013249638e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eafd4eb3a424cd3f626a1571309d9ebfbd800f50 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c317fb533e9427bea4d3f0c01e6b59a55b59cf768fe6a4312a2919cf70268f3 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..23d413741c62ba9fdcd3d062cab10e103882a0d4 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:962555670167ce5ff0485d2f520c06f2b9ceb1c8e9ad726e7496d1b5448de29c +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8cd03c5cb6a6235f0ada28e207623e5852d222fc --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:861218e312eb5653584f2d6de265075d4f4820af7c5e32e5fd3e744f484950b0 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..281112d470d14e17bb78f8b55a208555656b75fa --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efdff8fd5f2b5dd46ed3a62172c4bc8a5e0376aca067ebc1b637d48a795a6c6f +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e88adcac56addaef39b55efaeb5f2077a8206443 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e30c36715848e1843dbf5212a4cc704dcf99e543abae9f557fd22ece27fa0153 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ce01bd28ccac90c4cba1d2b50ba93051c9c00638 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/trainer_state.json @@ -0,0 +1,10953 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.054176072234763, + "eval_steps": 10, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.872052061510042e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0df5bb8d072da58c080f53aee8b5741254764fa2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0971a2d3a1bd62e1dbd54bf009897b4aefcea55057d84ced08d871a3fe19228 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a014b519056b0c2122232cbe6e805e9130cc854 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882e08ff8fbdc1fa031776a93c004f827857e72497b0a5cf2c831ab6878dfdeb +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dded2c8a791cef56f9b554e638945e630c4ec4f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a138a499e7c7751e9bb0b929ada643ba1de8c5bd1449c9c4b3dc5cfc9560e8b2 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..43fec138eeaf936a53296bc816d30240c2d7a06a --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24adcb8b9dbbf252758ef3d64e32a1c7ffb972248819b92b7b4e85c08b57fd1 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f24588bbb433fa5e7599627ea4be1e942d7c4ff0 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1fef047a5180960190b02878d8f28532932621f8812da3e97410cc6d0bef9c1 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a9e0917e831494855c8feb22f22dc39901e5cd6c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/trainer_state.json @@ -0,0 +1,12513 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.776523702031603, + "eval_steps": 10, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.85263715547218e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf65555201bacffc6af2d4fe5242e8c5461657e4 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aea78f737665b32ff682a94523a4bd011eabe266ff866e1ca694170c1d801d2 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..986adba6de1da38043336ac5b88aec0856432b0d --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c4e1c326f7061143673c26514892e020c79becb15409ecef4745433d3c815a +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c3900ac06843676ddb14c73bea215977cf4da73 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d84fad467daedb53478711dc5b9c5ae65e89dc43c60589da4ff3dda178e1829 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba81e8ad326038e5c9c9ae03c7559c048cff2e4e --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:570f9c1e6b14619b48355c94eb2fbdb25982372276f6c874c7c8f82cce548f2a +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2c27c8c159d3807da57f0274bdc788d56085a0f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d8b5cebddb4be69754b7cc7819e296176f45b4d0eea62707df05df1a3336297 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..32225c8cf229eb5c4dfb2335cdfd92a608851a25 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/trainer_state.json @@ -0,0 +1,14073 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.4984198645598195, + "eval_steps": 10, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.831998506722918e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9b56ad090743a7cbeb1e4e4e3f8d2aa1f51df785 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:752447f96cd14d90ba0e92c4511a88f6e6f5666b208981395ef8bf8c17029189 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..8570bc860b53e6117c283597e2f69a87b3e1f643 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d00c75a29afc03896905906879a65d9a7aa247ddff37e2cd8aa6b0dc1d431e09 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dadb5ebac95d2a7d6b218f06945aa7479ce4851f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a7e9ec568acd37fb3e0d9bd95f8255f72f7ad52ed216ec2b4b89b1fe8c0ee34 +size 124114170 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c72d5af7fa4c49fd048c74a62e852c1c4d2f83b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7f16fa6852715190720d2c7e539e20c49fb9c35a14af3fccfffab045d5b1c9e +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a329e77c34219b608ceece9ea9fc13783d0f8ee8 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f76b8d57b0dd3fab6c6bd79ca794bb22a3abcd0c17a70c6ada90d14bc492c67c +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5fa5e20300869016b9a1c06f288e43f3d8d036a8 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/trainer_state.json @@ -0,0 +1,1593 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7223476297968398, + "eval_steps": 10, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9812316666986496.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..34b0f9d8b7bf434bea5898d7f79e27ea0683d9db --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32a0728bdd59825b106b69bc32e0cca67812afdb5fafbeff6e349deeb8f74440 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..122ecdfb2dc8db346509ce9af99b4de1a3da4b18 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b92e253e71ed0dd11321b3df531e3863ded2ab85dd1ea14897d476331ed58272 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4f7973da0d02f352e5f6b16e60ae1638a1ddf25 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:334f1da75a37c24ed32b3a0804447fec02b9ccc30370f340ddb6e9b57afd801c +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5b90262809ac4e7b17e81a3e68dab7994b5e007d --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:339f44c56baa525ca73c31b41991d2d0e6a9160d393cff70b3a162086dd5544c +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..67d267d4242e1c364a1981a8f2c3d4df47e0dc2d --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e710ab3a98298c3cf1ca00c45239915f48103a8dc4df95b6f9c5c6e840cf478 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00bf8f53ae223904d2266525973883f2f54770a2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/trainer_state.json @@ -0,0 +1,15633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.220316027088036, + "eval_steps": 10, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.813342285574963e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2608069ca1844a3187a4cee81ade17528474251c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04b49eea147491809125f3b6783563a449939e196e87f7653c07f8987dc46446 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..ca6815ae8b0171b24dd052bfa0e2a0919af85bfe --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4204c38cf8f1389e2f50b7f5afb1c69b846a69164c716dbe9d7a057772586ce6 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ca79339d98f58c97aeccb699f1dc727a574dec5 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e91c7fa235915be3c9a47b36ea65b7da85fb330ca6b1a703c64f6f0f19366d3e +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..71656c8c701f76a2bdfa941f08990e7e3c13f2d3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80eac772dad2864f5b7aba59ebda0b22ea7a7cb235a7e9d8437ab53a971d7fdb +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3500ab965fdaa032b7d741b16f6ba38af1547ecb --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b65b6fc11c305d94d56b0c63fb5f5f05d3ad90349fd1c06538a5f4a946d3db8e +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af6d9627bd14efd037aa38ace7bbab5b3c8cc24c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/trainer_state.json @@ -0,0 +1,17193 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.942663656884876, + "eval_steps": 10, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0792291372557926e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..68aafb98ba3990bb355a5313b1121ae7ce5fab58 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:121996489b5074f435aa70d61206cdaa2a019e943b85456d5e41646729b6791f +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..ba3df25d657c9643ffc26e7e2f4b3a0df85ffb6b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:754182e956058a41e90546ff7e3f051702e7fa4743da1f971f71ae01c0ae0397 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8403d3d091d71ea69c6fbf0acf308fbf6380146c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50b813beb2bdd309d0b4fa0e2a849ea6a223aa705b62976f9b4403e8df0a6d6a +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b2dde3a9c6e8dae078b0440589d4dc2e2c64b11 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624b1a34bd78c2d11d07be2a55a9dbe6b7a815d0a1f304f9444b1911855036ca +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..16b1f566799794514a4f75ce9297b5edbededa84 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703fd4a1b5410238155546fe0cd2a82da4a3827c3bb7126b0270d51014e846a9 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0f39fbdff63206be4e80ed1610c7411e5381b600 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/trainer_state.json @@ -0,0 +1,18753 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.664559819413093, + "eval_steps": 10, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1775100320822067e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..42ea040f7e8df370cb91727d9be34e56fb42407e --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9a086355cea95a03812ba382e6a9ee72dbcaf7d55e950fb3df928e29a61e258 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..17118b8be1e147a17f7d97364fc8e3213868f1cf --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:231bec409113b098e960180b1296ae2852dcbc6c5310224cffb086f9611ecc0d +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..35ce84f0586cdd21177b6d11973829a0d069ca1b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5888b4e0e8281d400e853fd6884e04c834e215cad8fe5773a2883b74142ada13 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3894445b4815f1a23a88f9c0b5b7f69faea1285 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79ebdfd67195e9be70cc7b09f62bfb666e84a87bc9e0733eedccc79429a06cbe +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..06ed6fa3982becaff33a9d326a70bf2202f38dd2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df73f446be4205553cb8bb2ba59b0635a4402150cd7c2d8cd9b3a387b2b79fdf +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4602218690b741992172a4c5c8935b08c2e973b3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/trainer_state.json @@ -0,0 +1,20313 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.386455981941308, + "eval_steps": 10, + "global_step": 2600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2754048221433037e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..577833dd340ac43549fe142b516bdbf7577dc40f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:742804ac5ff7ba79ce82a327be01b44f20c5d102bfd1a87b335ae335eccd6565 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..47dbd3af89cdaa2ec4b673da01ab6d16a884683c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1305b9e8f1c306b4f682faa552219e7879167d04b02352353dff574566f946 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..69c3d9daccee2f14ecca3457dd69bebd502d1da5 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e3835b8385a8a7f12f1fc3982b96d291fe97c1757d519715b9d02dfe94bbd9 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2a177a7b3666907a4850226f00cfad3ef789d68c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:333c13ffdd84f379d6606137107483bcdc7ecc4a2a9fbeed75b2cc73f14baeb4 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ecd9c03d120c18a4630a802d889eb40c79f3166 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1508edd8fadde719fff675a62ca4cb6a67beef098395acf115aa0e284f622bc4 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a9978cb76be3ea28de5dd5e126c61cfbf2924fc3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/trainer_state.json @@ -0,0 +1,21873 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.108352144469526, + "eval_steps": 10, + "global_step": 2800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3737252824757043e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-2800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..632471f18f3e4035940a1477de97b8bc27d83905 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:120f5b8d17d3ea521ab0adb8380e2d86c08106f24595a3295f60bf110430239a +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..5b79cd02e7d28da9b390989eee69f1f60eadf1e0 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fb35a83e093f6185c5cedffefad36550f104ce5c686c5b03388fdeb1d51c22f +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2a2d2b9c565b815e5147a949842298ecd8b906f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d9ae51bed545caee1daa2e8707812d712299ad6bdc229bc4ce02bda0433e6b7 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..44b1790ae409e3cf05cb93956afc5335a6c41c57 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27f9561f8d61a862645871ba3b671e76fc9ba400b0e1ae944ec8c73686a794ca +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9069a1cbcc6ab33a5aa1a4be85d5d7b21a4d690d --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf8765dd99344258b1b68a2ccf740889840103c044fac4011b110a38b0989e1 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2619c1a809d58d3bdecf3df587b511b5a8d6b08f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/trainer_state.json @@ -0,0 +1,23433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 10.830699774266366, + "eval_steps": 10, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.4718456018527846e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0cb30090c7c51e8a16086d40cef51d6eb6c2e128 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cdbfbab630f44627d7f40d74c513c5cb64fcd7c05b65fad964c4ac476fe17a9 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..e23afd95447e2e74c8c2f65cf2c94c5b374b58e7 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65e24331a44a57b7e042cf7e067b30011c9a565f99513834cda93ce56fb479cf +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9011b0287cb4f18ef7432f26018e7aed94f572d8 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1a86f17a56f761cf65a29a3491f6cdbad7e79ce7241e4e04c040e0a352c2057 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c7c54770d78d8b010b9c513c0b8ee53748a68bf --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802d34dc4ad2bab7bf2331860ec4c995a2b56825c391d1c769a136190d26d41b +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..244d48e55812bc1947ba919e9186718aa46f7aea --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f3724ee19480c3958672b53293537f9588ed15bcc92a2a2ca8adef4967c9bd7 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6617018ab669406b30623759d39573c63de9434 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/trainer_state.json @@ -0,0 +1,24993 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 11.552595936794582, + "eval_steps": 10, + "global_step": 3200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.5693667440544973e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..487a21a0cb7639ebfb1d8548a25d807276cd7a82 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccb6ff2e581929924a713d66328bfe2df1d1a0e25a2e33ee0796e822a3c8acfe +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..bb8e8f44f25e5e827c5afafaef919d33543055e6 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd94e40e25c8d526dc6579005e6979501647a07d891a9abd3a78e92edab3b2fb +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d07e2340b0914bba6c756980a54affe6dc77d6a1 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:605a27024a5ccb7cdff2f6e568218fa15322411bd812379f991a710b9eb85061 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2327c4ba76d0020ded06ce7da64cb170d5b240bd --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce2e7e33cfb66b24e860f89dd5a374283adaf4f0202d6b99a2e0b435465e646d +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f63a6d1530b38a890b8dd805ef9e7b744874f970 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd86ddb176feee51fe8e5aba414ba5f0505294d965e624afe1418fbea487dd17 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3b214aa4395bb48c6586b0dc6d6e4b88f6b3b20 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/trainer_state.json @@ -0,0 +1,26553 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 12.2744920993228, + "eval_steps": 10, + "global_step": 3400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.667933732487168e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d8aee18da591a0459c518144f9695841165cc02f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:044b48ff27e4c49c6d0c04cdfa807892ceab715690db1b7ac39cde5309602fa5 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..c7d24022ea24393e94101c325d3926844082bae0 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0968903bc888de188b7b0cf6f0f1424a5a29ed653238d79ff30990f85b42180 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..267ef71461b44a1d0db03e4ea0fc9eef90a72a0c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69e7bf6d0b6d70c69d15320b76b7790f939a6173c1ed65fd2bce0ef580e3e3eb +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f9debb2fdb6c884343e3391b13ab23981accabd0 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80c48a400b77fb3f86b7745826092b2447ad957ae1870e5ad837b7f7aee4d6fc +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e541d901ebbc7d1ccd1c225ebcdbc8679de7b582 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b9d0880d92fb6a2b7e701ca08a844d566ca04e21d9ee5d106867a327d36fdd5 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b16244d4c45842d52309d5619d31bde1a410b70d --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/trainer_state.json @@ -0,0 +1,28113 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 12.996839729119639, + "eval_steps": 10, + "global_step": 3600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.765333212240937e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8e180a6742effaaf99294df6bc682a7b4a39c5f1 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecf0c635db978f15e54ae139dd32a4b61de021ff29f15494dca8ad2aec5fde15 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..784490bdd0c6a69a462fb7e5599e3b157b6a597b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5d4aac0ac4eb0ee46a74e2e78cee560604399716504ad722ecc4d4d1ffdd7bf +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cdae805fb423641cf8ebdec9651510b68350e8e --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad549f57be9003faba20f54229297fe304f6910caf8fc96c9344836aba101ac9 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2deb90617d1b63a1e8f902d52d4c2b50f2a9fde0 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc8612bb9d5ac75c1e825464b4b9c4b884c622e0465d7d4faf8335ffa92d41cb +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..411b04463f73e780fbab76b7da40e760c9226c2d --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eb78a5282569c1c68243b51d683c4a57732eb7f2126e73b3cfd90d3b2cc478b +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d0bf30bad797e6d36d87b40590cc733c1118ed5a --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/trainer_state.json @@ -0,0 +1,29673 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 13.718735891647855, + "eval_steps": 10, + "global_step": 3800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.8634169320420147e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-3800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..999a680753f5324b70d766fa96d8be2dcd87cb85 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee04884c25699573ed21449876b62f2a63cbb634b90c2d7b9c0ee97f3b47f626 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..a1e98d6c7a9b3d8981effcb55e0582f13689fce5 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cefa18f2990161b5da55663c4f52606076458ca6d27fc6c8581405164873ac3 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb13a9634535d079aaa12fbf49b6054765f32cec --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6070cfb17101a7b0b69b077881093dab0460310622f3797fa6129403d76625 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8e5340c353aa3fd3380c34f66acede2c4011b65a --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:346711a0589589ff45c8c90d8d4cca50c78e7a69ecf88992ec6a76e78ea0a2a3 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..112897aec9506f07aa9e2f86f8d0b099ccee4292 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e82f5a9ba5a45a9475809d00910ad2da90b0ac919869aa8fff6ff05e0b8466a +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d3e9beeb676db9cdc9eff6235a06bab3cef47d76 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/trainer_state.json @@ -0,0 +1,3153 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.4442437923250564, + "eval_steps": 10, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.970177717285683e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..548ef072f02fbc95d299e2adde404dcda7eeafdf --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bbab55a1422b0dbfebaa54b87c5360bfdace201c6015217534a53d9785a2715 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..e8b22eb791600ea15ec857727745c46ad34642da --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3716c72e768f1ee012632129a371dba691a60d296ba0703512144e3c04f07f83 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e17d79011b8067fa9656ce26acf859c7cd33c3d4 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d4eeb89c8cfbb111c6ca8fb8cdf28ed521b4f3ae97271eac59f13322a9ddea +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..81603b76cdfd5d07eab524fe48793f72dce2ac01 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:227f5040a6ebd3ce1a78f458c4ca583ce652e613c480aea118b00dd527ac3ba2 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..156fd1923cb4433d53be0b0ac57fa6823ffa25e4 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5af2f66974a88755224b7bc249801f2d4eb8c165cb5bf0c1d28b642c51974c4 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..519070e6db95a2d3687bb165152a6068fdbf7d8b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/trainer_state.json @@ -0,0 +1,31233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.440632054176072, + "eval_steps": 10, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9621957471941427e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9a96cfe805bf934b3051a2c273aab2dbb5f3f032 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1fb1642989f2ef12eb4dac1023adbd9f23a6d5e891842b29cc0e52b4bfb1eca +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..8b3f69dc5089f999f82e35d9b09d264cec2b6ff0 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afaeb3ac0c62338d1d247db7bd3c50cb7cd0f9233fc4a14e3baa236c6e8e6a36 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..46388193aa75921cef6934f18086b115bbd8e5a2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb1c0eec235d6c33824fc90b2d6c22ab16fd534fa97061cd6c921e92d697e8ca +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1e709ff8073a6003080a71a8b2f4d283db8db8bd --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7d7f325b3ec9ca7853f71d87a820c9bab2afbbece55653116b59d048cba56fb +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a1c920bf95095af526e12a99ac1ab87b94d7b5c5 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:657c25b66db230e6f360ab4cfad70edebce3123503fc80a27d624e38f6563958 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..64d14e1d5f36a0fa79b3f0d6f486313740a8a17a --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/trainer_state.json @@ -0,0 +1,32793 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.162528216704288, + "eval_steps": 10, + "global_step": 4200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.0597797077929165e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a7765ad3298fbe9f0d09c1d2970f3a62e3bc3007 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26c48cffb3efd8320212d6f3062e28ce4b10dfddb4e8058dd777a13a9b3c0cfb +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..dfa668b010702df33ddf7eec7c02c6c6869ad8f7 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef07af568e41556877a41a3405632c04a80665446b954788375faaccc4b18203 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..679fe034447c2fa3bea5b84c5de2bd6dec555da3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c306e276e891fd99955f59891694dc056e6f68ccc7e937a460f33bd871e1c06 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9050bce0b83a3ff679a37aae49cd3838dd17d18e --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16735483e7f459b7f7c853601a9a20d80f9a20030ffee66d2d1c34243d7360e +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..26cb2930b009bdfa1ddac71f63b95ba3b60d052c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807cb3fb9da896170a4eecc9febf434194e1e73040a6127023c47373726deef2 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a305482ffb3eca325e2b66ff6bd6dd2e702f3737 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/trainer_state.json @@ -0,0 +1,34353 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.884875846501128, + "eval_steps": 10, + "global_step": 4400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + }, + { + "epoch": 15.166139954853273, + "grad_norm": 227.31346130371094, + "learning_rate": 7.181488203266788e-06, + "loss": 28.0234, + "step": 4201 + }, + { + "epoch": 15.169751693002258, + "grad_norm": 208.75048828125, + "learning_rate": 7.176043557168784e-06, + "loss": 22.5147, + "step": 4202 + }, + { + "epoch": 15.173363431151241, + "grad_norm": 222.91090393066406, + "learning_rate": 7.17059891107078e-06, + "loss": 22.1029, + "step": 4203 + }, + { + "epoch": 15.176975169300226, + "grad_norm": 219.40621948242188, + "learning_rate": 7.165154264972777e-06, + "loss": 22.9827, + "step": 4204 + }, + { + "epoch": 15.18058690744921, + "grad_norm": 229.11813354492188, + "learning_rate": 7.1597096188747735e-06, + "loss": 23.6974, + "step": 4205 + }, + { + "epoch": 15.184198645598194, + "grad_norm": 256.7950744628906, + "learning_rate": 7.15426497277677e-06, + "loss": 39.6585, + "step": 4206 + }, + { + "epoch": 15.187810383747179, + "grad_norm": 237.47613525390625, + "learning_rate": 7.148820326678766e-06, + "loss": 40.0478, + "step": 4207 + }, + { + "epoch": 15.191422121896162, + "grad_norm": 259.54296875, + "learning_rate": 7.143375680580762e-06, + "loss": 39.7604, + "step": 4208 + }, + { + "epoch": 15.195033860045147, + "grad_norm": 249.7389678955078, + "learning_rate": 7.137931034482759e-06, + "loss": 39.0201, + "step": 4209 + }, + { + "epoch": 15.198645598194132, + "grad_norm": 298.4624938964844, + "learning_rate": 7.132486388384755e-06, + "loss": 39.8575, + "step": 4210 + }, + { + "epoch": 15.198645598194132, + "eval_loss": 0.6088115572929382, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 4210 + }, + { + "epoch": 15.202257336343115, + "grad_norm": 267.57659912109375, + "learning_rate": 7.127041742286752e-06, + "loss": 38.8929, + "step": 4211 + }, + { + "epoch": 15.2058690744921, + "grad_norm": 243.88333129882812, + "learning_rate": 7.121597096188748e-06, + "loss": 39.6078, + "step": 4212 + }, + { + "epoch": 15.209480812641084, + "grad_norm": 268.2644348144531, + "learning_rate": 7.116152450090745e-06, + "loss": 39.9488, + "step": 4213 + }, + { + "epoch": 15.213092550790067, + "grad_norm": 240.2657928466797, + "learning_rate": 7.11070780399274e-06, + "loss": 40.1645, + "step": 4214 + }, + { + "epoch": 15.216704288939052, + "grad_norm": 198.76910400390625, + "learning_rate": 7.105263157894737e-06, + "loss": 38.2229, + "step": 4215 + }, + { + "epoch": 15.220316027088035, + "grad_norm": 234.11170959472656, + "learning_rate": 7.099818511796734e-06, + "loss": 39.5294, + "step": 4216 + }, + { + "epoch": 15.22392776523702, + "grad_norm": 192.80194091796875, + "learning_rate": 7.094373865698729e-06, + "loss": 36.9752, + "step": 4217 + }, + { + "epoch": 15.227539503386005, + "grad_norm": 241.8236846923828, + "learning_rate": 7.088929219600726e-06, + "loss": 36.1043, + "step": 4218 + }, + { + "epoch": 15.231151241534988, + "grad_norm": 451.6199645996094, + "learning_rate": 7.083484573502722e-06, + "loss": 37.7911, + "step": 4219 + }, + { + "epoch": 15.234762979683973, + "grad_norm": 351.9429626464844, + "learning_rate": 7.0780399274047195e-06, + "loss": 35.5202, + "step": 4220 + }, + { + "epoch": 15.234762979683973, + "eval_loss": 0.6093130111694336, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 4220 + }, + { + "epoch": 15.238374717832958, + "grad_norm": 266.4995422363281, + "learning_rate": 7.072595281306715e-06, + "loss": 37.5552, + "step": 4221 + }, + { + "epoch": 15.241986455981941, + "grad_norm": 258.74578857421875, + "learning_rate": 7.067150635208712e-06, + "loss": 37.1315, + "step": 4222 + }, + { + "epoch": 15.245598194130926, + "grad_norm": 233.30921936035156, + "learning_rate": 7.061705989110708e-06, + "loss": 36.9237, + "step": 4223 + }, + { + "epoch": 15.249209932279909, + "grad_norm": 235.8688201904297, + "learning_rate": 7.056261343012704e-06, + "loss": 38.0112, + "step": 4224 + }, + { + "epoch": 15.252821670428894, + "grad_norm": 214.88436889648438, + "learning_rate": 7.050816696914701e-06, + "loss": 38.5641, + "step": 4225 + }, + { + "epoch": 15.256433408577879, + "grad_norm": 252.64144897460938, + "learning_rate": 7.045372050816697e-06, + "loss": 36.7125, + "step": 4226 + }, + { + "epoch": 15.260045146726862, + "grad_norm": 293.78424072265625, + "learning_rate": 7.039927404718694e-06, + "loss": 37.5956, + "step": 4227 + }, + { + "epoch": 15.263656884875846, + "grad_norm": 234.13510131835938, + "learning_rate": 7.03448275862069e-06, + "loss": 38.1829, + "step": 4228 + }, + { + "epoch": 15.267268623024831, + "grad_norm": 279.534912109375, + "learning_rate": 7.029038112522686e-06, + "loss": 39.0785, + "step": 4229 + }, + { + "epoch": 15.270880361173814, + "grad_norm": 246.4442596435547, + "learning_rate": 7.023593466424683e-06, + "loss": 39.1753, + "step": 4230 + }, + { + "epoch": 15.270880361173814, + "eval_loss": 0.6043311357498169, + "eval_runtime": 3.1452, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 4230 + }, + { + "epoch": 15.2744920993228, + "grad_norm": 233.87466430664062, + "learning_rate": 7.018148820326679e-06, + "loss": 39.8464, + "step": 4231 + }, + { + "epoch": 15.278103837471784, + "grad_norm": 228.54898071289062, + "learning_rate": 7.012704174228675e-06, + "loss": 37.9721, + "step": 4232 + }, + { + "epoch": 15.281715575620767, + "grad_norm": 273.70050048828125, + "learning_rate": 7.007259528130671e-06, + "loss": 38.9153, + "step": 4233 + }, + { + "epoch": 15.285327313769752, + "grad_norm": 269.8402404785156, + "learning_rate": 7.001814882032668e-06, + "loss": 36.7607, + "step": 4234 + }, + { + "epoch": 15.288939051918735, + "grad_norm": 260.13629150390625, + "learning_rate": 6.996370235934665e-06, + "loss": 35.3684, + "step": 4235 + }, + { + "epoch": 15.29255079006772, + "grad_norm": 223.9878692626953, + "learning_rate": 6.990925589836661e-06, + "loss": 32.8784, + "step": 4236 + }, + { + "epoch": 15.296162528216705, + "grad_norm": 225.69212341308594, + "learning_rate": 6.985480943738657e-06, + "loss": 31.3751, + "step": 4237 + }, + { + "epoch": 15.299774266365688, + "grad_norm": 215.99801635742188, + "learning_rate": 6.980036297640653e-06, + "loss": 31.5331, + "step": 4238 + }, + { + "epoch": 15.303386004514673, + "grad_norm": 263.26568603515625, + "learning_rate": 6.97459165154265e-06, + "loss": 32.5806, + "step": 4239 + }, + { + "epoch": 15.306997742663658, + "grad_norm": 203.2392578125, + "learning_rate": 6.969147005444646e-06, + "loss": 31.6379, + "step": 4240 + }, + { + "epoch": 15.306997742663658, + "eval_loss": 0.6046441793441772, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 4240 + }, + { + "epoch": 15.31060948081264, + "grad_norm": 221.2167510986328, + "learning_rate": 6.963702359346643e-06, + "loss": 33.7034, + "step": 4241 + }, + { + "epoch": 15.314221218961626, + "grad_norm": 212.58737182617188, + "learning_rate": 6.958257713248639e-06, + "loss": 32.5511, + "step": 4242 + }, + { + "epoch": 15.317832957110609, + "grad_norm": 270.7123718261719, + "learning_rate": 6.952813067150635e-06, + "loss": 33.2513, + "step": 4243 + }, + { + "epoch": 15.321444695259594, + "grad_norm": 270.2066345214844, + "learning_rate": 6.9473684210526315e-06, + "loss": 33.9559, + "step": 4244 + }, + { + "epoch": 15.325056433408578, + "grad_norm": 232.8043212890625, + "learning_rate": 6.941923774954628e-06, + "loss": 33.9916, + "step": 4245 + }, + { + "epoch": 15.328668171557561, + "grad_norm": 325.419921875, + "learning_rate": 6.936479128856625e-06, + "loss": 35.2098, + "step": 4246 + }, + { + "epoch": 15.332279909706546, + "grad_norm": 303.326416015625, + "learning_rate": 6.93103448275862e-06, + "loss": 35.0784, + "step": 4247 + }, + { + "epoch": 15.335891647855531, + "grad_norm": 327.05963134765625, + "learning_rate": 6.925589836660617e-06, + "loss": 35.9915, + "step": 4248 + }, + { + "epoch": 15.339503386004514, + "grad_norm": 326.58795166015625, + "learning_rate": 6.9201451905626135e-06, + "loss": 35.1914, + "step": 4249 + }, + { + "epoch": 15.343115124153499, + "grad_norm": 406.38812255859375, + "learning_rate": 6.914700544464611e-06, + "loss": 37.1535, + "step": 4250 + }, + { + "epoch": 15.343115124153499, + "eval_loss": 0.6056071519851685, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 4250 + }, + { + "epoch": 15.346726862302482, + "grad_norm": 325.6965637207031, + "learning_rate": 6.909255898366606e-06, + "loss": 29.8698, + "step": 4251 + }, + { + "epoch": 15.350338600451467, + "grad_norm": 212.59727478027344, + "learning_rate": 6.903811252268603e-06, + "loss": 22.2995, + "step": 4252 + }, + { + "epoch": 15.353950338600452, + "grad_norm": 257.447509765625, + "learning_rate": 6.898366606170599e-06, + "loss": 23.1014, + "step": 4253 + }, + { + "epoch": 15.357562076749435, + "grad_norm": 266.139892578125, + "learning_rate": 6.8929219600725955e-06, + "loss": 23.2319, + "step": 4254 + }, + { + "epoch": 15.36117381489842, + "grad_norm": 332.7207336425781, + "learning_rate": 6.887477313974592e-06, + "loss": 23.7218, + "step": 4255 + }, + { + "epoch": 15.364785553047405, + "grad_norm": 272.7341003417969, + "learning_rate": 6.882032667876588e-06, + "loss": 39.5787, + "step": 4256 + }, + { + "epoch": 15.368397291196388, + "grad_norm": 259.00872802734375, + "learning_rate": 6.876588021778585e-06, + "loss": 41.0874, + "step": 4257 + }, + { + "epoch": 15.372009029345373, + "grad_norm": 236.87033081054688, + "learning_rate": 6.8711433756805804e-06, + "loss": 38.9811, + "step": 4258 + }, + { + "epoch": 15.375620767494357, + "grad_norm": 293.6808776855469, + "learning_rate": 6.8656987295825775e-06, + "loss": 39.481, + "step": 4259 + }, + { + "epoch": 15.37923250564334, + "grad_norm": 266.0845947265625, + "learning_rate": 6.860254083484574e-06, + "loss": 39.4595, + "step": 4260 + }, + { + "epoch": 15.37923250564334, + "eval_loss": 0.6039742231369019, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 4260 + }, + { + "epoch": 15.382844243792325, + "grad_norm": 398.0877685546875, + "learning_rate": 6.85480943738657e-06, + "loss": 38.8899, + "step": 4261 + }, + { + "epoch": 15.386455981941308, + "grad_norm": 208.37376403808594, + "learning_rate": 6.849364791288566e-06, + "loss": 39.2194, + "step": 4262 + }, + { + "epoch": 15.390067720090293, + "grad_norm": 214.6958770751953, + "learning_rate": 6.8439201451905624e-06, + "loss": 38.9911, + "step": 4263 + }, + { + "epoch": 15.393679458239278, + "grad_norm": 210.2147674560547, + "learning_rate": 6.8384754990925595e-06, + "loss": 40.5973, + "step": 4264 + }, + { + "epoch": 15.397291196388261, + "grad_norm": 240.47030639648438, + "learning_rate": 6.833030852994556e-06, + "loss": 39.3936, + "step": 4265 + }, + { + "epoch": 15.400902934537246, + "grad_norm": 273.86883544921875, + "learning_rate": 6.827586206896552e-06, + "loss": 40.0848, + "step": 4266 + }, + { + "epoch": 15.404514672686231, + "grad_norm": 239.36453247070312, + "learning_rate": 6.822141560798548e-06, + "loss": 36.5967, + "step": 4267 + }, + { + "epoch": 15.408126410835214, + "grad_norm": 215.3413543701172, + "learning_rate": 6.8166969147005444e-06, + "loss": 37.8173, + "step": 4268 + }, + { + "epoch": 15.411738148984199, + "grad_norm": 260.1557312011719, + "learning_rate": 6.811252268602541e-06, + "loss": 37.7175, + "step": 4269 + }, + { + "epoch": 15.415349887133182, + "grad_norm": 239.4988555908203, + "learning_rate": 6.805807622504537e-06, + "loss": 37.0618, + "step": 4270 + }, + { + "epoch": 15.415349887133182, + "eval_loss": 0.6049810647964478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4270 + }, + { + "epoch": 15.418961625282167, + "grad_norm": 223.06094360351562, + "learning_rate": 6.800362976406534e-06, + "loss": 37.0687, + "step": 4271 + }, + { + "epoch": 15.422573363431152, + "grad_norm": 261.7460632324219, + "learning_rate": 6.79491833030853e-06, + "loss": 35.9437, + "step": 4272 + }, + { + "epoch": 15.426185101580135, + "grad_norm": 230.92135620117188, + "learning_rate": 6.7894736842105264e-06, + "loss": 38.3316, + "step": 4273 + }, + { + "epoch": 15.42979683972912, + "grad_norm": 370.6309509277344, + "learning_rate": 6.784029038112523e-06, + "loss": 38.2666, + "step": 4274 + }, + { + "epoch": 15.433408577878104, + "grad_norm": 249.7823944091797, + "learning_rate": 6.77858439201452e-06, + "loss": 38.1159, + "step": 4275 + }, + { + "epoch": 15.437020316027088, + "grad_norm": 404.1676330566406, + "learning_rate": 6.773139745916516e-06, + "loss": 37.6548, + "step": 4276 + }, + { + "epoch": 15.440632054176072, + "grad_norm": 256.3241271972656, + "learning_rate": 6.767695099818511e-06, + "loss": 38.3713, + "step": 4277 + }, + { + "epoch": 15.444243792325057, + "grad_norm": 240.55934143066406, + "learning_rate": 6.7622504537205084e-06, + "loss": 39.2487, + "step": 4278 + }, + { + "epoch": 15.44785553047404, + "grad_norm": 230.010009765625, + "learning_rate": 6.756805807622505e-06, + "loss": 39.4391, + "step": 4279 + }, + { + "epoch": 15.451467268623025, + "grad_norm": 226.51385498046875, + "learning_rate": 6.751361161524502e-06, + "loss": 38.6273, + "step": 4280 + }, + { + "epoch": 15.451467268623025, + "eval_loss": 0.6027400493621826, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 4280 + }, + { + "epoch": 15.455079006772008, + "grad_norm": 314.57476806640625, + "learning_rate": 6.745916515426497e-06, + "loss": 38.583, + "step": 4281 + }, + { + "epoch": 15.458690744920993, + "grad_norm": 229.91238403320312, + "learning_rate": 6.740471869328494e-06, + "loss": 39.2433, + "step": 4282 + }, + { + "epoch": 15.462302483069978, + "grad_norm": 284.7301330566406, + "learning_rate": 6.7350272232304904e-06, + "loss": 38.8577, + "step": 4283 + }, + { + "epoch": 15.465914221218961, + "grad_norm": 209.32266235351562, + "learning_rate": 6.729582577132486e-06, + "loss": 34.928, + "step": 4284 + }, + { + "epoch": 15.469525959367946, + "grad_norm": 264.6195068359375, + "learning_rate": 6.724137931034483e-06, + "loss": 32.0527, + "step": 4285 + }, + { + "epoch": 15.47313769751693, + "grad_norm": 224.2421112060547, + "learning_rate": 6.718693284936479e-06, + "loss": 31.939, + "step": 4286 + }, + { + "epoch": 15.476749435665914, + "grad_norm": 233.0791015625, + "learning_rate": 6.713248638838476e-06, + "loss": 32.5402, + "step": 4287 + }, + { + "epoch": 15.480361173814899, + "grad_norm": 284.129638671875, + "learning_rate": 6.707803992740472e-06, + "loss": 31.0069, + "step": 4288 + }, + { + "epoch": 15.483972911963882, + "grad_norm": 253.6517791748047, + "learning_rate": 6.702359346642469e-06, + "loss": 32.0172, + "step": 4289 + }, + { + "epoch": 15.487584650112867, + "grad_norm": 305.63775634765625, + "learning_rate": 6.696914700544465e-06, + "loss": 34.1643, + "step": 4290 + }, + { + "epoch": 15.487584650112867, + "eval_loss": 0.6044390201568604, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 4290 + }, + { + "epoch": 15.491196388261852, + "grad_norm": 224.6516876220703, + "learning_rate": 6.691470054446461e-06, + "loss": 32.4735, + "step": 4291 + }, + { + "epoch": 15.494808126410835, + "grad_norm": 257.5385437011719, + "learning_rate": 6.686025408348457e-06, + "loss": 33.9272, + "step": 4292 + }, + { + "epoch": 15.49841986455982, + "grad_norm": 393.9106140136719, + "learning_rate": 6.680580762250454e-06, + "loss": 34.4176, + "step": 4293 + }, + { + "epoch": 15.502031602708804, + "grad_norm": 333.5639953613281, + "learning_rate": 6.675136116152451e-06, + "loss": 34.5695, + "step": 4294 + }, + { + "epoch": 15.505643340857787, + "grad_norm": 319.8660888671875, + "learning_rate": 6.669691470054446e-06, + "loss": 34.5337, + "step": 4295 + }, + { + "epoch": 15.509255079006772, + "grad_norm": 246.78086853027344, + "learning_rate": 6.664246823956443e-06, + "loss": 34.8297, + "step": 4296 + }, + { + "epoch": 15.512866817155757, + "grad_norm": 313.4530944824219, + "learning_rate": 6.658802177858439e-06, + "loss": 34.6901, + "step": 4297 + }, + { + "epoch": 15.51647855530474, + "grad_norm": 257.2852783203125, + "learning_rate": 6.6533575317604364e-06, + "loss": 35.3892, + "step": 4298 + }, + { + "epoch": 15.520090293453725, + "grad_norm": 336.5549011230469, + "learning_rate": 6.647912885662432e-06, + "loss": 36.3347, + "step": 4299 + }, + { + "epoch": 15.523702031602708, + "grad_norm": 275.726806640625, + "learning_rate": 6.642468239564428e-06, + "loss": 36.3559, + "step": 4300 + }, + { + "epoch": 15.523702031602708, + "eval_loss": 0.6056334376335144, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 4300 + }, + { + "epoch": 15.527313769751693, + "grad_norm": 275.5987243652344, + "learning_rate": 6.637023593466425e-06, + "loss": 28.5887, + "step": 4301 + }, + { + "epoch": 15.530925507900678, + "grad_norm": 242.59762573242188, + "learning_rate": 6.631578947368421e-06, + "loss": 22.1398, + "step": 4302 + }, + { + "epoch": 15.534537246049661, + "grad_norm": 228.04344177246094, + "learning_rate": 6.626134301270418e-06, + "loss": 21.4593, + "step": 4303 + }, + { + "epoch": 15.538148984198646, + "grad_norm": 204.2377166748047, + "learning_rate": 6.620689655172414e-06, + "loss": 22.5132, + "step": 4304 + }, + { + "epoch": 15.54176072234763, + "grad_norm": 243.0237579345703, + "learning_rate": 6.615245009074411e-06, + "loss": 24.2777, + "step": 4305 + }, + { + "epoch": 15.545372460496614, + "grad_norm": 227.2841339111328, + "learning_rate": 6.609800362976407e-06, + "loss": 39.7235, + "step": 4306 + }, + { + "epoch": 15.548984198645599, + "grad_norm": 253.8453826904297, + "learning_rate": 6.6043557168784025e-06, + "loss": 39.9317, + "step": 4307 + }, + { + "epoch": 15.552595936794582, + "grad_norm": 243.62757873535156, + "learning_rate": 6.5989110707804e-06, + "loss": 38.9825, + "step": 4308 + }, + { + "epoch": 15.556207674943566, + "grad_norm": 262.4398498535156, + "learning_rate": 6.593466424682396e-06, + "loss": 39.7456, + "step": 4309 + }, + { + "epoch": 15.559819413092551, + "grad_norm": 268.5821228027344, + "learning_rate": 6.588021778584392e-06, + "loss": 39.5152, + "step": 4310 + }, + { + "epoch": 15.559819413092551, + "eval_loss": 0.6060237288475037, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4310 + }, + { + "epoch": 15.563431151241534, + "grad_norm": 297.6933898925781, + "learning_rate": 6.582577132486388e-06, + "loss": 40.1259, + "step": 4311 + }, + { + "epoch": 15.56704288939052, + "grad_norm": 234.08816528320312, + "learning_rate": 6.577132486388385e-06, + "loss": 40.8591, + "step": 4312 + }, + { + "epoch": 15.570654627539504, + "grad_norm": 292.2416687011719, + "learning_rate": 6.571687840290382e-06, + "loss": 39.2377, + "step": 4313 + }, + { + "epoch": 15.574266365688487, + "grad_norm": 205.25888061523438, + "learning_rate": 6.566243194192377e-06, + "loss": 39.92, + "step": 4314 + }, + { + "epoch": 15.577878103837472, + "grad_norm": 229.06695556640625, + "learning_rate": 6.560798548094374e-06, + "loss": 39.8886, + "step": 4315 + }, + { + "epoch": 15.581489841986457, + "grad_norm": 223.3977508544922, + "learning_rate": 6.55535390199637e-06, + "loss": 38.5423, + "step": 4316 + }, + { + "epoch": 15.58510158013544, + "grad_norm": 254.60203552246094, + "learning_rate": 6.549909255898367e-06, + "loss": 36.8055, + "step": 4317 + }, + { + "epoch": 15.588713318284425, + "grad_norm": 304.463623046875, + "learning_rate": 6.544464609800363e-06, + "loss": 37.6164, + "step": 4318 + }, + { + "epoch": 15.592325056433408, + "grad_norm": 279.955810546875, + "learning_rate": 6.53901996370236e-06, + "loss": 37.4778, + "step": 4319 + }, + { + "epoch": 15.595936794582393, + "grad_norm": 230.11105346679688, + "learning_rate": 6.533575317604356e-06, + "loss": 36.9663, + "step": 4320 + }, + { + "epoch": 15.595936794582393, + "eval_loss": 0.6048213243484497, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.966, + "eval_steps_per_second": 56.966, + "step": 4320 + }, + { + "epoch": 15.599548532731378, + "grad_norm": 261.98187255859375, + "learning_rate": 6.528130671506351e-06, + "loss": 37.7402, + "step": 4321 + }, + { + "epoch": 15.60316027088036, + "grad_norm": 247.34771728515625, + "learning_rate": 6.5226860254083485e-06, + "loss": 37.1402, + "step": 4322 + }, + { + "epoch": 15.606772009029346, + "grad_norm": 277.1517333984375, + "learning_rate": 6.517241379310345e-06, + "loss": 38.3976, + "step": 4323 + }, + { + "epoch": 15.610383747178329, + "grad_norm": 231.89683532714844, + "learning_rate": 6.511796733212342e-06, + "loss": 38.0834, + "step": 4324 + }, + { + "epoch": 15.613995485327314, + "grad_norm": 323.8349304199219, + "learning_rate": 6.506352087114337e-06, + "loss": 37.9085, + "step": 4325 + }, + { + "epoch": 15.617607223476298, + "grad_norm": 263.5240783691406, + "learning_rate": 6.500907441016334e-06, + "loss": 37.0702, + "step": 4326 + }, + { + "epoch": 15.621218961625281, + "grad_norm": 217.0517578125, + "learning_rate": 6.4954627949183305e-06, + "loss": 36.9406, + "step": 4327 + }, + { + "epoch": 15.624830699774266, + "grad_norm": 267.4161682128906, + "learning_rate": 6.4900181488203276e-06, + "loss": 38.8773, + "step": 4328 + }, + { + "epoch": 15.628442437923251, + "grad_norm": 232.36000061035156, + "learning_rate": 6.484573502722323e-06, + "loss": 38.4978, + "step": 4329 + }, + { + "epoch": 15.632054176072234, + "grad_norm": 241.61373901367188, + "learning_rate": 6.479128856624319e-06, + "loss": 38.4895, + "step": 4330 + }, + { + "epoch": 15.632054176072234, + "eval_loss": 0.6024956703186035, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4330 + }, + { + "epoch": 15.635665914221219, + "grad_norm": 232.27928161621094, + "learning_rate": 6.473684210526316e-06, + "loss": 38.8551, + "step": 4331 + }, + { + "epoch": 15.639277652370204, + "grad_norm": 243.42828369140625, + "learning_rate": 6.4682395644283125e-06, + "loss": 38.6475, + "step": 4332 + }, + { + "epoch": 15.642889390519187, + "grad_norm": 306.2618103027344, + "learning_rate": 6.462794918330309e-06, + "loss": 37.2015, + "step": 4333 + }, + { + "epoch": 15.646501128668172, + "grad_norm": 335.795166015625, + "learning_rate": 6.457350272232305e-06, + "loss": 36.5255, + "step": 4334 + }, + { + "epoch": 15.650112866817155, + "grad_norm": 209.6246337890625, + "learning_rate": 6.451905626134302e-06, + "loss": 32.4219, + "step": 4335 + }, + { + "epoch": 15.65372460496614, + "grad_norm": 283.2094421386719, + "learning_rate": 6.446460980036297e-06, + "loss": 30.9137, + "step": 4336 + }, + { + "epoch": 15.657336343115125, + "grad_norm": 255.4412841796875, + "learning_rate": 6.441016333938294e-06, + "loss": 30.8939, + "step": 4337 + }, + { + "epoch": 15.660948081264108, + "grad_norm": 217.8052215576172, + "learning_rate": 6.435571687840291e-06, + "loss": 31.5974, + "step": 4338 + }, + { + "epoch": 15.664559819413093, + "grad_norm": 215.64398193359375, + "learning_rate": 6.430127041742287e-06, + "loss": 30.0276, + "step": 4339 + }, + { + "epoch": 15.668171557562077, + "grad_norm": 244.32704162597656, + "learning_rate": 6.424682395644283e-06, + "loss": 32.5249, + "step": 4340 + }, + { + "epoch": 15.668171557562077, + "eval_loss": 0.6037233471870422, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4340 + }, + { + "epoch": 15.67178329571106, + "grad_norm": 270.9132080078125, + "learning_rate": 6.419237749546279e-06, + "loss": 32.9923, + "step": 4341 + }, + { + "epoch": 15.675395033860045, + "grad_norm": 230.20314025878906, + "learning_rate": 6.4137931034482765e-06, + "loss": 32.871, + "step": 4342 + }, + { + "epoch": 15.679006772009028, + "grad_norm": 372.4366149902344, + "learning_rate": 6.408348457350273e-06, + "loss": 35.2687, + "step": 4343 + }, + { + "epoch": 15.682618510158013, + "grad_norm": 325.0901794433594, + "learning_rate": 6.402903811252268e-06, + "loss": 34.3107, + "step": 4344 + }, + { + "epoch": 15.686230248306998, + "grad_norm": 277.8683166503906, + "learning_rate": 6.397459165154265e-06, + "loss": 34.291, + "step": 4345 + }, + { + "epoch": 15.689841986455981, + "grad_norm": 262.566162109375, + "learning_rate": 6.392014519056261e-06, + "loss": 33.2989, + "step": 4346 + }, + { + "epoch": 15.693453724604966, + "grad_norm": 293.56536865234375, + "learning_rate": 6.386569872958258e-06, + "loss": 35.6865, + "step": 4347 + }, + { + "epoch": 15.697065462753951, + "grad_norm": 291.1886291503906, + "learning_rate": 6.381125226860254e-06, + "loss": 35.6959, + "step": 4348 + }, + { + "epoch": 15.700677200902934, + "grad_norm": 265.2365417480469, + "learning_rate": 6.375680580762251e-06, + "loss": 36.479, + "step": 4349 + }, + { + "epoch": 15.704288939051919, + "grad_norm": 342.8822021484375, + "learning_rate": 6.370235934664247e-06, + "loss": 35.9198, + "step": 4350 + }, + { + "epoch": 15.704288939051919, + "eval_loss": 0.603361189365387, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4350 + }, + { + "epoch": 15.707900677200904, + "grad_norm": 276.1657409667969, + "learning_rate": 6.364791288566243e-06, + "loss": 29.429, + "step": 4351 + }, + { + "epoch": 15.711512415349887, + "grad_norm": 267.2456359863281, + "learning_rate": 6.35934664246824e-06, + "loss": 23.0038, + "step": 4352 + }, + { + "epoch": 15.715124153498872, + "grad_norm": 255.4893798828125, + "learning_rate": 6.353901996370236e-06, + "loss": 21.1185, + "step": 4353 + }, + { + "epoch": 15.718735891647855, + "grad_norm": 252.10501098632812, + "learning_rate": 6.348457350272233e-06, + "loss": 23.1769, + "step": 4354 + }, + { + "epoch": 15.72234762979684, + "grad_norm": 239.63905334472656, + "learning_rate": 6.343012704174228e-06, + "loss": 24.5905, + "step": 4355 + }, + { + "epoch": 15.725959367945824, + "grad_norm": 228.00950622558594, + "learning_rate": 6.337568058076225e-06, + "loss": 39.6657, + "step": 4356 + }, + { + "epoch": 15.729571106094808, + "grad_norm": 234.10647583007812, + "learning_rate": 6.332123411978222e-06, + "loss": 41.145, + "step": 4357 + }, + { + "epoch": 15.733182844243792, + "grad_norm": 236.55223083496094, + "learning_rate": 6.326678765880219e-06, + "loss": 40.2784, + "step": 4358 + }, + { + "epoch": 15.736794582392777, + "grad_norm": 340.1712646484375, + "learning_rate": 6.321234119782214e-06, + "loss": 39.3598, + "step": 4359 + }, + { + "epoch": 15.74040632054176, + "grad_norm": 269.4134826660156, + "learning_rate": 6.31578947368421e-06, + "loss": 38.7777, + "step": 4360 + }, + { + "epoch": 15.74040632054176, + "eval_loss": 0.6048015356063843, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4360 + }, + { + "epoch": 15.744018058690745, + "grad_norm": 316.5471496582031, + "learning_rate": 6.310344827586207e-06, + "loss": 39.6707, + "step": 4361 + }, + { + "epoch": 15.747629796839728, + "grad_norm": 231.31820678710938, + "learning_rate": 6.304900181488203e-06, + "loss": 38.0009, + "step": 4362 + }, + { + "epoch": 15.751241534988713, + "grad_norm": 207.19117736816406, + "learning_rate": 6.2994555353902e-06, + "loss": 41.6523, + "step": 4363 + }, + { + "epoch": 15.754853273137698, + "grad_norm": 239.8341064453125, + "learning_rate": 6.294010889292196e-06, + "loss": 40.3203, + "step": 4364 + }, + { + "epoch": 15.758465011286681, + "grad_norm": 277.2004089355469, + "learning_rate": 6.288566243194193e-06, + "loss": 39.8026, + "step": 4365 + }, + { + "epoch": 15.762076749435666, + "grad_norm": 227.74728393554688, + "learning_rate": 6.2831215970961886e-06, + "loss": 38.1561, + "step": 4366 + }, + { + "epoch": 15.76568848758465, + "grad_norm": 268.6826477050781, + "learning_rate": 6.277676950998185e-06, + "loss": 37.4653, + "step": 4367 + }, + { + "epoch": 15.769300225733634, + "grad_norm": 308.92950439453125, + "learning_rate": 6.272232304900182e-06, + "loss": 36.3506, + "step": 4368 + }, + { + "epoch": 15.772911963882619, + "grad_norm": 216.53627014160156, + "learning_rate": 6.266787658802178e-06, + "loss": 36.12, + "step": 4369 + }, + { + "epoch": 15.776523702031604, + "grad_norm": 264.0691833496094, + "learning_rate": 6.261343012704174e-06, + "loss": 37.5023, + "step": 4370 + }, + { + "epoch": 15.776523702031604, + "eval_loss": 0.608928382396698, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.989, + "eval_steps_per_second": 56.989, + "step": 4370 + }, + { + "epoch": 15.780135440180587, + "grad_norm": 474.7265319824219, + "learning_rate": 6.2558983666061706e-06, + "loss": 38.8381, + "step": 4371 + }, + { + "epoch": 15.783747178329572, + "grad_norm": 303.66229248046875, + "learning_rate": 6.250453720508168e-06, + "loss": 36.5951, + "step": 4372 + }, + { + "epoch": 15.787358916478555, + "grad_norm": 231.65744018554688, + "learning_rate": 6.245009074410164e-06, + "loss": 36.4717, + "step": 4373 + }, + { + "epoch": 15.79097065462754, + "grad_norm": 235.25833129882812, + "learning_rate": 6.239564428312159e-06, + "loss": 38.4578, + "step": 4374 + }, + { + "epoch": 15.794582392776524, + "grad_norm": 215.5384063720703, + "learning_rate": 6.234119782214156e-06, + "loss": 38.0475, + "step": 4375 + }, + { + "epoch": 15.798194130925507, + "grad_norm": 216.3609619140625, + "learning_rate": 6.2286751361161526e-06, + "loss": 37.1825, + "step": 4376 + }, + { + "epoch": 15.801805869074492, + "grad_norm": 275.54522705078125, + "learning_rate": 6.223230490018149e-06, + "loss": 38.5608, + "step": 4377 + }, + { + "epoch": 15.805417607223477, + "grad_norm": 226.7752685546875, + "learning_rate": 6.217785843920145e-06, + "loss": 38.0612, + "step": 4378 + }, + { + "epoch": 15.80902934537246, + "grad_norm": 262.14501953125, + "learning_rate": 6.212341197822142e-06, + "loss": 38.0049, + "step": 4379 + }, + { + "epoch": 15.812641083521445, + "grad_norm": 299.82196044921875, + "learning_rate": 6.206896551724138e-06, + "loss": 39.1441, + "step": 4380 + }, + { + "epoch": 15.812641083521445, + "eval_loss": 0.6033969521522522, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4380 + }, + { + "epoch": 15.816252821670428, + "grad_norm": 295.24188232421875, + "learning_rate": 6.2014519056261346e-06, + "loss": 39.266, + "step": 4381 + }, + { + "epoch": 15.819864559819413, + "grad_norm": 298.1729736328125, + "learning_rate": 6.196007259528131e-06, + "loss": 39.4025, + "step": 4382 + }, + { + "epoch": 15.823476297968398, + "grad_norm": 234.97958374023438, + "learning_rate": 6.190562613430127e-06, + "loss": 39.4752, + "step": 4383 + }, + { + "epoch": 15.827088036117381, + "grad_norm": 270.3009338378906, + "learning_rate": 6.185117967332124e-06, + "loss": 36.0322, + "step": 4384 + }, + { + "epoch": 15.830699774266366, + "grad_norm": 279.78314208984375, + "learning_rate": 6.1796733212341195e-06, + "loss": 33.3256, + "step": 4385 + }, + { + "epoch": 15.83431151241535, + "grad_norm": 258.82598876953125, + "learning_rate": 6.1742286751361166e-06, + "loss": 33.1552, + "step": 4386 + }, + { + "epoch": 15.837923250564334, + "grad_norm": 280.8109130859375, + "learning_rate": 6.168784029038113e-06, + "loss": 32.0024, + "step": 4387 + }, + { + "epoch": 15.841534988713319, + "grad_norm": 265.08111572265625, + "learning_rate": 6.163339382940109e-06, + "loss": 32.4901, + "step": 4388 + }, + { + "epoch": 15.845146726862303, + "grad_norm": 316.56427001953125, + "learning_rate": 6.157894736842105e-06, + "loss": 33.1995, + "step": 4389 + }, + { + "epoch": 15.848758465011286, + "grad_norm": 256.03717041015625, + "learning_rate": 6.1524500907441015e-06, + "loss": 33.1914, + "step": 4390 + }, + { + "epoch": 15.848758465011286, + "eval_loss": 0.6017575263977051, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 4390 + }, + { + "epoch": 15.852370203160271, + "grad_norm": 242.54119873046875, + "learning_rate": 6.1470054446460985e-06, + "loss": 33.8459, + "step": 4391 + }, + { + "epoch": 15.855981941309254, + "grad_norm": 259.1406555175781, + "learning_rate": 6.141560798548094e-06, + "loss": 34.1317, + "step": 4392 + }, + { + "epoch": 15.85959367945824, + "grad_norm": 272.77880859375, + "learning_rate": 6.136116152450091e-06, + "loss": 34.2777, + "step": 4393 + }, + { + "epoch": 15.863205417607224, + "grad_norm": 231.60845947265625, + "learning_rate": 6.130671506352087e-06, + "loss": 34.0165, + "step": 4394 + }, + { + "epoch": 15.866817155756207, + "grad_norm": 230.85675048828125, + "learning_rate": 6.125226860254084e-06, + "loss": 34.2761, + "step": 4395 + }, + { + "epoch": 15.870428893905192, + "grad_norm": 307.4486389160156, + "learning_rate": 6.11978221415608e-06, + "loss": 33.7407, + "step": 4396 + }, + { + "epoch": 15.874040632054175, + "grad_norm": 264.7835388183594, + "learning_rate": 6.114337568058076e-06, + "loss": 34.1672, + "step": 4397 + }, + { + "epoch": 15.87765237020316, + "grad_norm": 234.93968200683594, + "learning_rate": 6.108892921960073e-06, + "loss": 35.7158, + "step": 4398 + }, + { + "epoch": 15.881264108352145, + "grad_norm": 300.0079345703125, + "learning_rate": 6.103448275862069e-06, + "loss": 36.1292, + "step": 4399 + }, + { + "epoch": 15.884875846501128, + "grad_norm": 326.20416259765625, + "learning_rate": 6.0980036297640655e-06, + "loss": 34.8222, + "step": 4400 + }, + { + "epoch": 15.884875846501128, + "eval_loss": 0.6024067401885986, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4400 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.1578911293800448e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af39eb73c6071a97d40f3172063556b6c1d95b21 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07bb2fe9e280eafaf6e8b4c4d204ed3b0c0e693fd10787bca074a89324ce8e2e +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..81489a74ff6d985250d0f3c0f42cf3a308d9413c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab1ac7a5b0dd10fc5e8e28f91185ace56a994587fd81708b6ced2531810cfb6 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c76140573fd127d305b273fd8541848bdc6a650 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0d2fd74b98ecf86832e61e1e87f569a68aac47155ba2a720af4cc591b9fa8e +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9b91490f5c66739b0a2e8983109cf9735366c9b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b4f6929ed828964309570d0e4e45bc9d59234c2211f3e2a623edc2e8f9b25c +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..50642f3cf2214db081b5c26f5b8ec1ff61ca3084 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab09101d6eb866470aed477b8f591ac627cab51fc51a99ea8995d097bf4db695 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5aa9aa90c2910e744d6748798df8561388c6e271 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/trainer_state.json @@ -0,0 +1,35913 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 16.606772009029346, + "eval_steps": 10, + "global_step": 4600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + }, + { + "epoch": 15.166139954853273, + "grad_norm": 227.31346130371094, + "learning_rate": 7.181488203266788e-06, + "loss": 28.0234, + "step": 4201 + }, + { + "epoch": 15.169751693002258, + "grad_norm": 208.75048828125, + "learning_rate": 7.176043557168784e-06, + "loss": 22.5147, + "step": 4202 + }, + { + "epoch": 15.173363431151241, + "grad_norm": 222.91090393066406, + "learning_rate": 7.17059891107078e-06, + "loss": 22.1029, + "step": 4203 + }, + { + "epoch": 15.176975169300226, + "grad_norm": 219.40621948242188, + "learning_rate": 7.165154264972777e-06, + "loss": 22.9827, + "step": 4204 + }, + { + "epoch": 15.18058690744921, + "grad_norm": 229.11813354492188, + "learning_rate": 7.1597096188747735e-06, + "loss": 23.6974, + "step": 4205 + }, + { + "epoch": 15.184198645598194, + "grad_norm": 256.7950744628906, + "learning_rate": 7.15426497277677e-06, + "loss": 39.6585, + "step": 4206 + }, + { + "epoch": 15.187810383747179, + "grad_norm": 237.47613525390625, + "learning_rate": 7.148820326678766e-06, + "loss": 40.0478, + "step": 4207 + }, + { + "epoch": 15.191422121896162, + "grad_norm": 259.54296875, + "learning_rate": 7.143375680580762e-06, + "loss": 39.7604, + "step": 4208 + }, + { + "epoch": 15.195033860045147, + "grad_norm": 249.7389678955078, + "learning_rate": 7.137931034482759e-06, + "loss": 39.0201, + "step": 4209 + }, + { + "epoch": 15.198645598194132, + "grad_norm": 298.4624938964844, + "learning_rate": 7.132486388384755e-06, + "loss": 39.8575, + "step": 4210 + }, + { + "epoch": 15.198645598194132, + "eval_loss": 0.6088115572929382, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 4210 + }, + { + "epoch": 15.202257336343115, + "grad_norm": 267.57659912109375, + "learning_rate": 7.127041742286752e-06, + "loss": 38.8929, + "step": 4211 + }, + { + "epoch": 15.2058690744921, + "grad_norm": 243.88333129882812, + "learning_rate": 7.121597096188748e-06, + "loss": 39.6078, + "step": 4212 + }, + { + "epoch": 15.209480812641084, + "grad_norm": 268.2644348144531, + "learning_rate": 7.116152450090745e-06, + "loss": 39.9488, + "step": 4213 + }, + { + "epoch": 15.213092550790067, + "grad_norm": 240.2657928466797, + "learning_rate": 7.11070780399274e-06, + "loss": 40.1645, + "step": 4214 + }, + { + "epoch": 15.216704288939052, + "grad_norm": 198.76910400390625, + "learning_rate": 7.105263157894737e-06, + "loss": 38.2229, + "step": 4215 + }, + { + "epoch": 15.220316027088035, + "grad_norm": 234.11170959472656, + "learning_rate": 7.099818511796734e-06, + "loss": 39.5294, + "step": 4216 + }, + { + "epoch": 15.22392776523702, + "grad_norm": 192.80194091796875, + "learning_rate": 7.094373865698729e-06, + "loss": 36.9752, + "step": 4217 + }, + { + "epoch": 15.227539503386005, + "grad_norm": 241.8236846923828, + "learning_rate": 7.088929219600726e-06, + "loss": 36.1043, + "step": 4218 + }, + { + "epoch": 15.231151241534988, + "grad_norm": 451.6199645996094, + "learning_rate": 7.083484573502722e-06, + "loss": 37.7911, + "step": 4219 + }, + { + "epoch": 15.234762979683973, + "grad_norm": 351.9429626464844, + "learning_rate": 7.0780399274047195e-06, + "loss": 35.5202, + "step": 4220 + }, + { + "epoch": 15.234762979683973, + "eval_loss": 0.6093130111694336, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 4220 + }, + { + "epoch": 15.238374717832958, + "grad_norm": 266.4995422363281, + "learning_rate": 7.072595281306715e-06, + "loss": 37.5552, + "step": 4221 + }, + { + "epoch": 15.241986455981941, + "grad_norm": 258.74578857421875, + "learning_rate": 7.067150635208712e-06, + "loss": 37.1315, + "step": 4222 + }, + { + "epoch": 15.245598194130926, + "grad_norm": 233.30921936035156, + "learning_rate": 7.061705989110708e-06, + "loss": 36.9237, + "step": 4223 + }, + { + "epoch": 15.249209932279909, + "grad_norm": 235.8688201904297, + "learning_rate": 7.056261343012704e-06, + "loss": 38.0112, + "step": 4224 + }, + { + "epoch": 15.252821670428894, + "grad_norm": 214.88436889648438, + "learning_rate": 7.050816696914701e-06, + "loss": 38.5641, + "step": 4225 + }, + { + "epoch": 15.256433408577879, + "grad_norm": 252.64144897460938, + "learning_rate": 7.045372050816697e-06, + "loss": 36.7125, + "step": 4226 + }, + { + "epoch": 15.260045146726862, + "grad_norm": 293.78424072265625, + "learning_rate": 7.039927404718694e-06, + "loss": 37.5956, + "step": 4227 + }, + { + "epoch": 15.263656884875846, + "grad_norm": 234.13510131835938, + "learning_rate": 7.03448275862069e-06, + "loss": 38.1829, + "step": 4228 + }, + { + "epoch": 15.267268623024831, + "grad_norm": 279.534912109375, + "learning_rate": 7.029038112522686e-06, + "loss": 39.0785, + "step": 4229 + }, + { + "epoch": 15.270880361173814, + "grad_norm": 246.4442596435547, + "learning_rate": 7.023593466424683e-06, + "loss": 39.1753, + "step": 4230 + }, + { + "epoch": 15.270880361173814, + "eval_loss": 0.6043311357498169, + "eval_runtime": 3.1452, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 4230 + }, + { + "epoch": 15.2744920993228, + "grad_norm": 233.87466430664062, + "learning_rate": 7.018148820326679e-06, + "loss": 39.8464, + "step": 4231 + }, + { + "epoch": 15.278103837471784, + "grad_norm": 228.54898071289062, + "learning_rate": 7.012704174228675e-06, + "loss": 37.9721, + "step": 4232 + }, + { + "epoch": 15.281715575620767, + "grad_norm": 273.70050048828125, + "learning_rate": 7.007259528130671e-06, + "loss": 38.9153, + "step": 4233 + }, + { + "epoch": 15.285327313769752, + "grad_norm": 269.8402404785156, + "learning_rate": 7.001814882032668e-06, + "loss": 36.7607, + "step": 4234 + }, + { + "epoch": 15.288939051918735, + "grad_norm": 260.13629150390625, + "learning_rate": 6.996370235934665e-06, + "loss": 35.3684, + "step": 4235 + }, + { + "epoch": 15.29255079006772, + "grad_norm": 223.9878692626953, + "learning_rate": 6.990925589836661e-06, + "loss": 32.8784, + "step": 4236 + }, + { + "epoch": 15.296162528216705, + "grad_norm": 225.69212341308594, + "learning_rate": 6.985480943738657e-06, + "loss": 31.3751, + "step": 4237 + }, + { + "epoch": 15.299774266365688, + "grad_norm": 215.99801635742188, + "learning_rate": 6.980036297640653e-06, + "loss": 31.5331, + "step": 4238 + }, + { + "epoch": 15.303386004514673, + "grad_norm": 263.26568603515625, + "learning_rate": 6.97459165154265e-06, + "loss": 32.5806, + "step": 4239 + }, + { + "epoch": 15.306997742663658, + "grad_norm": 203.2392578125, + "learning_rate": 6.969147005444646e-06, + "loss": 31.6379, + "step": 4240 + }, + { + "epoch": 15.306997742663658, + "eval_loss": 0.6046441793441772, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 4240 + }, + { + "epoch": 15.31060948081264, + "grad_norm": 221.2167510986328, + "learning_rate": 6.963702359346643e-06, + "loss": 33.7034, + "step": 4241 + }, + { + "epoch": 15.314221218961626, + "grad_norm": 212.58737182617188, + "learning_rate": 6.958257713248639e-06, + "loss": 32.5511, + "step": 4242 + }, + { + "epoch": 15.317832957110609, + "grad_norm": 270.7123718261719, + "learning_rate": 6.952813067150635e-06, + "loss": 33.2513, + "step": 4243 + }, + { + "epoch": 15.321444695259594, + "grad_norm": 270.2066345214844, + "learning_rate": 6.9473684210526315e-06, + "loss": 33.9559, + "step": 4244 + }, + { + "epoch": 15.325056433408578, + "grad_norm": 232.8043212890625, + "learning_rate": 6.941923774954628e-06, + "loss": 33.9916, + "step": 4245 + }, + { + "epoch": 15.328668171557561, + "grad_norm": 325.419921875, + "learning_rate": 6.936479128856625e-06, + "loss": 35.2098, + "step": 4246 + }, + { + "epoch": 15.332279909706546, + "grad_norm": 303.326416015625, + "learning_rate": 6.93103448275862e-06, + "loss": 35.0784, + "step": 4247 + }, + { + "epoch": 15.335891647855531, + "grad_norm": 327.05963134765625, + "learning_rate": 6.925589836660617e-06, + "loss": 35.9915, + "step": 4248 + }, + { + "epoch": 15.339503386004514, + "grad_norm": 326.58795166015625, + "learning_rate": 6.9201451905626135e-06, + "loss": 35.1914, + "step": 4249 + }, + { + "epoch": 15.343115124153499, + "grad_norm": 406.38812255859375, + "learning_rate": 6.914700544464611e-06, + "loss": 37.1535, + "step": 4250 + }, + { + "epoch": 15.343115124153499, + "eval_loss": 0.6056071519851685, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 4250 + }, + { + "epoch": 15.346726862302482, + "grad_norm": 325.6965637207031, + "learning_rate": 6.909255898366606e-06, + "loss": 29.8698, + "step": 4251 + }, + { + "epoch": 15.350338600451467, + "grad_norm": 212.59727478027344, + "learning_rate": 6.903811252268603e-06, + "loss": 22.2995, + "step": 4252 + }, + { + "epoch": 15.353950338600452, + "grad_norm": 257.447509765625, + "learning_rate": 6.898366606170599e-06, + "loss": 23.1014, + "step": 4253 + }, + { + "epoch": 15.357562076749435, + "grad_norm": 266.139892578125, + "learning_rate": 6.8929219600725955e-06, + "loss": 23.2319, + "step": 4254 + }, + { + "epoch": 15.36117381489842, + "grad_norm": 332.7207336425781, + "learning_rate": 6.887477313974592e-06, + "loss": 23.7218, + "step": 4255 + }, + { + "epoch": 15.364785553047405, + "grad_norm": 272.7341003417969, + "learning_rate": 6.882032667876588e-06, + "loss": 39.5787, + "step": 4256 + }, + { + "epoch": 15.368397291196388, + "grad_norm": 259.00872802734375, + "learning_rate": 6.876588021778585e-06, + "loss": 41.0874, + "step": 4257 + }, + { + "epoch": 15.372009029345373, + "grad_norm": 236.87033081054688, + "learning_rate": 6.8711433756805804e-06, + "loss": 38.9811, + "step": 4258 + }, + { + "epoch": 15.375620767494357, + "grad_norm": 293.6808776855469, + "learning_rate": 6.8656987295825775e-06, + "loss": 39.481, + "step": 4259 + }, + { + "epoch": 15.37923250564334, + "grad_norm": 266.0845947265625, + "learning_rate": 6.860254083484574e-06, + "loss": 39.4595, + "step": 4260 + }, + { + "epoch": 15.37923250564334, + "eval_loss": 0.6039742231369019, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 4260 + }, + { + "epoch": 15.382844243792325, + "grad_norm": 398.0877685546875, + "learning_rate": 6.85480943738657e-06, + "loss": 38.8899, + "step": 4261 + }, + { + "epoch": 15.386455981941308, + "grad_norm": 208.37376403808594, + "learning_rate": 6.849364791288566e-06, + "loss": 39.2194, + "step": 4262 + }, + { + "epoch": 15.390067720090293, + "grad_norm": 214.6958770751953, + "learning_rate": 6.8439201451905624e-06, + "loss": 38.9911, + "step": 4263 + }, + { + "epoch": 15.393679458239278, + "grad_norm": 210.2147674560547, + "learning_rate": 6.8384754990925595e-06, + "loss": 40.5973, + "step": 4264 + }, + { + "epoch": 15.397291196388261, + "grad_norm": 240.47030639648438, + "learning_rate": 6.833030852994556e-06, + "loss": 39.3936, + "step": 4265 + }, + { + "epoch": 15.400902934537246, + "grad_norm": 273.86883544921875, + "learning_rate": 6.827586206896552e-06, + "loss": 40.0848, + "step": 4266 + }, + { + "epoch": 15.404514672686231, + "grad_norm": 239.36453247070312, + "learning_rate": 6.822141560798548e-06, + "loss": 36.5967, + "step": 4267 + }, + { + "epoch": 15.408126410835214, + "grad_norm": 215.3413543701172, + "learning_rate": 6.8166969147005444e-06, + "loss": 37.8173, + "step": 4268 + }, + { + "epoch": 15.411738148984199, + "grad_norm": 260.1557312011719, + "learning_rate": 6.811252268602541e-06, + "loss": 37.7175, + "step": 4269 + }, + { + "epoch": 15.415349887133182, + "grad_norm": 239.4988555908203, + "learning_rate": 6.805807622504537e-06, + "loss": 37.0618, + "step": 4270 + }, + { + "epoch": 15.415349887133182, + "eval_loss": 0.6049810647964478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4270 + }, + { + "epoch": 15.418961625282167, + "grad_norm": 223.06094360351562, + "learning_rate": 6.800362976406534e-06, + "loss": 37.0687, + "step": 4271 + }, + { + "epoch": 15.422573363431152, + "grad_norm": 261.7460632324219, + "learning_rate": 6.79491833030853e-06, + "loss": 35.9437, + "step": 4272 + }, + { + "epoch": 15.426185101580135, + "grad_norm": 230.92135620117188, + "learning_rate": 6.7894736842105264e-06, + "loss": 38.3316, + "step": 4273 + }, + { + "epoch": 15.42979683972912, + "grad_norm": 370.6309509277344, + "learning_rate": 6.784029038112523e-06, + "loss": 38.2666, + "step": 4274 + }, + { + "epoch": 15.433408577878104, + "grad_norm": 249.7823944091797, + "learning_rate": 6.77858439201452e-06, + "loss": 38.1159, + "step": 4275 + }, + { + "epoch": 15.437020316027088, + "grad_norm": 404.1676330566406, + "learning_rate": 6.773139745916516e-06, + "loss": 37.6548, + "step": 4276 + }, + { + "epoch": 15.440632054176072, + "grad_norm": 256.3241271972656, + "learning_rate": 6.767695099818511e-06, + "loss": 38.3713, + "step": 4277 + }, + { + "epoch": 15.444243792325057, + "grad_norm": 240.55934143066406, + "learning_rate": 6.7622504537205084e-06, + "loss": 39.2487, + "step": 4278 + }, + { + "epoch": 15.44785553047404, + "grad_norm": 230.010009765625, + "learning_rate": 6.756805807622505e-06, + "loss": 39.4391, + "step": 4279 + }, + { + "epoch": 15.451467268623025, + "grad_norm": 226.51385498046875, + "learning_rate": 6.751361161524502e-06, + "loss": 38.6273, + "step": 4280 + }, + { + "epoch": 15.451467268623025, + "eval_loss": 0.6027400493621826, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 4280 + }, + { + "epoch": 15.455079006772008, + "grad_norm": 314.57476806640625, + "learning_rate": 6.745916515426497e-06, + "loss": 38.583, + "step": 4281 + }, + { + "epoch": 15.458690744920993, + "grad_norm": 229.91238403320312, + "learning_rate": 6.740471869328494e-06, + "loss": 39.2433, + "step": 4282 + }, + { + "epoch": 15.462302483069978, + "grad_norm": 284.7301330566406, + "learning_rate": 6.7350272232304904e-06, + "loss": 38.8577, + "step": 4283 + }, + { + "epoch": 15.465914221218961, + "grad_norm": 209.32266235351562, + "learning_rate": 6.729582577132486e-06, + "loss": 34.928, + "step": 4284 + }, + { + "epoch": 15.469525959367946, + "grad_norm": 264.6195068359375, + "learning_rate": 6.724137931034483e-06, + "loss": 32.0527, + "step": 4285 + }, + { + "epoch": 15.47313769751693, + "grad_norm": 224.2421112060547, + "learning_rate": 6.718693284936479e-06, + "loss": 31.939, + "step": 4286 + }, + { + "epoch": 15.476749435665914, + "grad_norm": 233.0791015625, + "learning_rate": 6.713248638838476e-06, + "loss": 32.5402, + "step": 4287 + }, + { + "epoch": 15.480361173814899, + "grad_norm": 284.129638671875, + "learning_rate": 6.707803992740472e-06, + "loss": 31.0069, + "step": 4288 + }, + { + "epoch": 15.483972911963882, + "grad_norm": 253.6517791748047, + "learning_rate": 6.702359346642469e-06, + "loss": 32.0172, + "step": 4289 + }, + { + "epoch": 15.487584650112867, + "grad_norm": 305.63775634765625, + "learning_rate": 6.696914700544465e-06, + "loss": 34.1643, + "step": 4290 + }, + { + "epoch": 15.487584650112867, + "eval_loss": 0.6044390201568604, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 4290 + }, + { + "epoch": 15.491196388261852, + "grad_norm": 224.6516876220703, + "learning_rate": 6.691470054446461e-06, + "loss": 32.4735, + "step": 4291 + }, + { + "epoch": 15.494808126410835, + "grad_norm": 257.5385437011719, + "learning_rate": 6.686025408348457e-06, + "loss": 33.9272, + "step": 4292 + }, + { + "epoch": 15.49841986455982, + "grad_norm": 393.9106140136719, + "learning_rate": 6.680580762250454e-06, + "loss": 34.4176, + "step": 4293 + }, + { + "epoch": 15.502031602708804, + "grad_norm": 333.5639953613281, + "learning_rate": 6.675136116152451e-06, + "loss": 34.5695, + "step": 4294 + }, + { + "epoch": 15.505643340857787, + "grad_norm": 319.8660888671875, + "learning_rate": 6.669691470054446e-06, + "loss": 34.5337, + "step": 4295 + }, + { + "epoch": 15.509255079006772, + "grad_norm": 246.78086853027344, + "learning_rate": 6.664246823956443e-06, + "loss": 34.8297, + "step": 4296 + }, + { + "epoch": 15.512866817155757, + "grad_norm": 313.4530944824219, + "learning_rate": 6.658802177858439e-06, + "loss": 34.6901, + "step": 4297 + }, + { + "epoch": 15.51647855530474, + "grad_norm": 257.2852783203125, + "learning_rate": 6.6533575317604364e-06, + "loss": 35.3892, + "step": 4298 + }, + { + "epoch": 15.520090293453725, + "grad_norm": 336.5549011230469, + "learning_rate": 6.647912885662432e-06, + "loss": 36.3347, + "step": 4299 + }, + { + "epoch": 15.523702031602708, + "grad_norm": 275.726806640625, + "learning_rate": 6.642468239564428e-06, + "loss": 36.3559, + "step": 4300 + }, + { + "epoch": 15.523702031602708, + "eval_loss": 0.6056334376335144, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 4300 + }, + { + "epoch": 15.527313769751693, + "grad_norm": 275.5987243652344, + "learning_rate": 6.637023593466425e-06, + "loss": 28.5887, + "step": 4301 + }, + { + "epoch": 15.530925507900678, + "grad_norm": 242.59762573242188, + "learning_rate": 6.631578947368421e-06, + "loss": 22.1398, + "step": 4302 + }, + { + "epoch": 15.534537246049661, + "grad_norm": 228.04344177246094, + "learning_rate": 6.626134301270418e-06, + "loss": 21.4593, + "step": 4303 + }, + { + "epoch": 15.538148984198646, + "grad_norm": 204.2377166748047, + "learning_rate": 6.620689655172414e-06, + "loss": 22.5132, + "step": 4304 + }, + { + "epoch": 15.54176072234763, + "grad_norm": 243.0237579345703, + "learning_rate": 6.615245009074411e-06, + "loss": 24.2777, + "step": 4305 + }, + { + "epoch": 15.545372460496614, + "grad_norm": 227.2841339111328, + "learning_rate": 6.609800362976407e-06, + "loss": 39.7235, + "step": 4306 + }, + { + "epoch": 15.548984198645599, + "grad_norm": 253.8453826904297, + "learning_rate": 6.6043557168784025e-06, + "loss": 39.9317, + "step": 4307 + }, + { + "epoch": 15.552595936794582, + "grad_norm": 243.62757873535156, + "learning_rate": 6.5989110707804e-06, + "loss": 38.9825, + "step": 4308 + }, + { + "epoch": 15.556207674943566, + "grad_norm": 262.4398498535156, + "learning_rate": 6.593466424682396e-06, + "loss": 39.7456, + "step": 4309 + }, + { + "epoch": 15.559819413092551, + "grad_norm": 268.5821228027344, + "learning_rate": 6.588021778584392e-06, + "loss": 39.5152, + "step": 4310 + }, + { + "epoch": 15.559819413092551, + "eval_loss": 0.6060237288475037, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4310 + }, + { + "epoch": 15.563431151241534, + "grad_norm": 297.6933898925781, + "learning_rate": 6.582577132486388e-06, + "loss": 40.1259, + "step": 4311 + }, + { + "epoch": 15.56704288939052, + "grad_norm": 234.08816528320312, + "learning_rate": 6.577132486388385e-06, + "loss": 40.8591, + "step": 4312 + }, + { + "epoch": 15.570654627539504, + "grad_norm": 292.2416687011719, + "learning_rate": 6.571687840290382e-06, + "loss": 39.2377, + "step": 4313 + }, + { + "epoch": 15.574266365688487, + "grad_norm": 205.25888061523438, + "learning_rate": 6.566243194192377e-06, + "loss": 39.92, + "step": 4314 + }, + { + "epoch": 15.577878103837472, + "grad_norm": 229.06695556640625, + "learning_rate": 6.560798548094374e-06, + "loss": 39.8886, + "step": 4315 + }, + { + "epoch": 15.581489841986457, + "grad_norm": 223.3977508544922, + "learning_rate": 6.55535390199637e-06, + "loss": 38.5423, + "step": 4316 + }, + { + "epoch": 15.58510158013544, + "grad_norm": 254.60203552246094, + "learning_rate": 6.549909255898367e-06, + "loss": 36.8055, + "step": 4317 + }, + { + "epoch": 15.588713318284425, + "grad_norm": 304.463623046875, + "learning_rate": 6.544464609800363e-06, + "loss": 37.6164, + "step": 4318 + }, + { + "epoch": 15.592325056433408, + "grad_norm": 279.955810546875, + "learning_rate": 6.53901996370236e-06, + "loss": 37.4778, + "step": 4319 + }, + { + "epoch": 15.595936794582393, + "grad_norm": 230.11105346679688, + "learning_rate": 6.533575317604356e-06, + "loss": 36.9663, + "step": 4320 + }, + { + "epoch": 15.595936794582393, + "eval_loss": 0.6048213243484497, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.966, + "eval_steps_per_second": 56.966, + "step": 4320 + }, + { + "epoch": 15.599548532731378, + "grad_norm": 261.98187255859375, + "learning_rate": 6.528130671506351e-06, + "loss": 37.7402, + "step": 4321 + }, + { + "epoch": 15.60316027088036, + "grad_norm": 247.34771728515625, + "learning_rate": 6.5226860254083485e-06, + "loss": 37.1402, + "step": 4322 + }, + { + "epoch": 15.606772009029346, + "grad_norm": 277.1517333984375, + "learning_rate": 6.517241379310345e-06, + "loss": 38.3976, + "step": 4323 + }, + { + "epoch": 15.610383747178329, + "grad_norm": 231.89683532714844, + "learning_rate": 6.511796733212342e-06, + "loss": 38.0834, + "step": 4324 + }, + { + "epoch": 15.613995485327314, + "grad_norm": 323.8349304199219, + "learning_rate": 6.506352087114337e-06, + "loss": 37.9085, + "step": 4325 + }, + { + "epoch": 15.617607223476298, + "grad_norm": 263.5240783691406, + "learning_rate": 6.500907441016334e-06, + "loss": 37.0702, + "step": 4326 + }, + { + "epoch": 15.621218961625281, + "grad_norm": 217.0517578125, + "learning_rate": 6.4954627949183305e-06, + "loss": 36.9406, + "step": 4327 + }, + { + "epoch": 15.624830699774266, + "grad_norm": 267.4161682128906, + "learning_rate": 6.4900181488203276e-06, + "loss": 38.8773, + "step": 4328 + }, + { + "epoch": 15.628442437923251, + "grad_norm": 232.36000061035156, + "learning_rate": 6.484573502722323e-06, + "loss": 38.4978, + "step": 4329 + }, + { + "epoch": 15.632054176072234, + "grad_norm": 241.61373901367188, + "learning_rate": 6.479128856624319e-06, + "loss": 38.4895, + "step": 4330 + }, + { + "epoch": 15.632054176072234, + "eval_loss": 0.6024956703186035, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4330 + }, + { + "epoch": 15.635665914221219, + "grad_norm": 232.27928161621094, + "learning_rate": 6.473684210526316e-06, + "loss": 38.8551, + "step": 4331 + }, + { + "epoch": 15.639277652370204, + "grad_norm": 243.42828369140625, + "learning_rate": 6.4682395644283125e-06, + "loss": 38.6475, + "step": 4332 + }, + { + "epoch": 15.642889390519187, + "grad_norm": 306.2618103027344, + "learning_rate": 6.462794918330309e-06, + "loss": 37.2015, + "step": 4333 + }, + { + "epoch": 15.646501128668172, + "grad_norm": 335.795166015625, + "learning_rate": 6.457350272232305e-06, + "loss": 36.5255, + "step": 4334 + }, + { + "epoch": 15.650112866817155, + "grad_norm": 209.6246337890625, + "learning_rate": 6.451905626134302e-06, + "loss": 32.4219, + "step": 4335 + }, + { + "epoch": 15.65372460496614, + "grad_norm": 283.2094421386719, + "learning_rate": 6.446460980036297e-06, + "loss": 30.9137, + "step": 4336 + }, + { + "epoch": 15.657336343115125, + "grad_norm": 255.4412841796875, + "learning_rate": 6.441016333938294e-06, + "loss": 30.8939, + "step": 4337 + }, + { + "epoch": 15.660948081264108, + "grad_norm": 217.8052215576172, + "learning_rate": 6.435571687840291e-06, + "loss": 31.5974, + "step": 4338 + }, + { + "epoch": 15.664559819413093, + "grad_norm": 215.64398193359375, + "learning_rate": 6.430127041742287e-06, + "loss": 30.0276, + "step": 4339 + }, + { + "epoch": 15.668171557562077, + "grad_norm": 244.32704162597656, + "learning_rate": 6.424682395644283e-06, + "loss": 32.5249, + "step": 4340 + }, + { + "epoch": 15.668171557562077, + "eval_loss": 0.6037233471870422, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4340 + }, + { + "epoch": 15.67178329571106, + "grad_norm": 270.9132080078125, + "learning_rate": 6.419237749546279e-06, + "loss": 32.9923, + "step": 4341 + }, + { + "epoch": 15.675395033860045, + "grad_norm": 230.20314025878906, + "learning_rate": 6.4137931034482765e-06, + "loss": 32.871, + "step": 4342 + }, + { + "epoch": 15.679006772009028, + "grad_norm": 372.4366149902344, + "learning_rate": 6.408348457350273e-06, + "loss": 35.2687, + "step": 4343 + }, + { + "epoch": 15.682618510158013, + "grad_norm": 325.0901794433594, + "learning_rate": 6.402903811252268e-06, + "loss": 34.3107, + "step": 4344 + }, + { + "epoch": 15.686230248306998, + "grad_norm": 277.8683166503906, + "learning_rate": 6.397459165154265e-06, + "loss": 34.291, + "step": 4345 + }, + { + "epoch": 15.689841986455981, + "grad_norm": 262.566162109375, + "learning_rate": 6.392014519056261e-06, + "loss": 33.2989, + "step": 4346 + }, + { + "epoch": 15.693453724604966, + "grad_norm": 293.56536865234375, + "learning_rate": 6.386569872958258e-06, + "loss": 35.6865, + "step": 4347 + }, + { + "epoch": 15.697065462753951, + "grad_norm": 291.1886291503906, + "learning_rate": 6.381125226860254e-06, + "loss": 35.6959, + "step": 4348 + }, + { + "epoch": 15.700677200902934, + "grad_norm": 265.2365417480469, + "learning_rate": 6.375680580762251e-06, + "loss": 36.479, + "step": 4349 + }, + { + "epoch": 15.704288939051919, + "grad_norm": 342.8822021484375, + "learning_rate": 6.370235934664247e-06, + "loss": 35.9198, + "step": 4350 + }, + { + "epoch": 15.704288939051919, + "eval_loss": 0.603361189365387, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4350 + }, + { + "epoch": 15.707900677200904, + "grad_norm": 276.1657409667969, + "learning_rate": 6.364791288566243e-06, + "loss": 29.429, + "step": 4351 + }, + { + "epoch": 15.711512415349887, + "grad_norm": 267.2456359863281, + "learning_rate": 6.35934664246824e-06, + "loss": 23.0038, + "step": 4352 + }, + { + "epoch": 15.715124153498872, + "grad_norm": 255.4893798828125, + "learning_rate": 6.353901996370236e-06, + "loss": 21.1185, + "step": 4353 + }, + { + "epoch": 15.718735891647855, + "grad_norm": 252.10501098632812, + "learning_rate": 6.348457350272233e-06, + "loss": 23.1769, + "step": 4354 + }, + { + "epoch": 15.72234762979684, + "grad_norm": 239.63905334472656, + "learning_rate": 6.343012704174228e-06, + "loss": 24.5905, + "step": 4355 + }, + { + "epoch": 15.725959367945824, + "grad_norm": 228.00950622558594, + "learning_rate": 6.337568058076225e-06, + "loss": 39.6657, + "step": 4356 + }, + { + "epoch": 15.729571106094808, + "grad_norm": 234.10647583007812, + "learning_rate": 6.332123411978222e-06, + "loss": 41.145, + "step": 4357 + }, + { + "epoch": 15.733182844243792, + "grad_norm": 236.55223083496094, + "learning_rate": 6.326678765880219e-06, + "loss": 40.2784, + "step": 4358 + }, + { + "epoch": 15.736794582392777, + "grad_norm": 340.1712646484375, + "learning_rate": 6.321234119782214e-06, + "loss": 39.3598, + "step": 4359 + }, + { + "epoch": 15.74040632054176, + "grad_norm": 269.4134826660156, + "learning_rate": 6.31578947368421e-06, + "loss": 38.7777, + "step": 4360 + }, + { + "epoch": 15.74040632054176, + "eval_loss": 0.6048015356063843, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4360 + }, + { + "epoch": 15.744018058690745, + "grad_norm": 316.5471496582031, + "learning_rate": 6.310344827586207e-06, + "loss": 39.6707, + "step": 4361 + }, + { + "epoch": 15.747629796839728, + "grad_norm": 231.31820678710938, + "learning_rate": 6.304900181488203e-06, + "loss": 38.0009, + "step": 4362 + }, + { + "epoch": 15.751241534988713, + "grad_norm": 207.19117736816406, + "learning_rate": 6.2994555353902e-06, + "loss": 41.6523, + "step": 4363 + }, + { + "epoch": 15.754853273137698, + "grad_norm": 239.8341064453125, + "learning_rate": 6.294010889292196e-06, + "loss": 40.3203, + "step": 4364 + }, + { + "epoch": 15.758465011286681, + "grad_norm": 277.2004089355469, + "learning_rate": 6.288566243194193e-06, + "loss": 39.8026, + "step": 4365 + }, + { + "epoch": 15.762076749435666, + "grad_norm": 227.74728393554688, + "learning_rate": 6.2831215970961886e-06, + "loss": 38.1561, + "step": 4366 + }, + { + "epoch": 15.76568848758465, + "grad_norm": 268.6826477050781, + "learning_rate": 6.277676950998185e-06, + "loss": 37.4653, + "step": 4367 + }, + { + "epoch": 15.769300225733634, + "grad_norm": 308.92950439453125, + "learning_rate": 6.272232304900182e-06, + "loss": 36.3506, + "step": 4368 + }, + { + "epoch": 15.772911963882619, + "grad_norm": 216.53627014160156, + "learning_rate": 6.266787658802178e-06, + "loss": 36.12, + "step": 4369 + }, + { + "epoch": 15.776523702031604, + "grad_norm": 264.0691833496094, + "learning_rate": 6.261343012704174e-06, + "loss": 37.5023, + "step": 4370 + }, + { + "epoch": 15.776523702031604, + "eval_loss": 0.608928382396698, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.989, + "eval_steps_per_second": 56.989, + "step": 4370 + }, + { + "epoch": 15.780135440180587, + "grad_norm": 474.7265319824219, + "learning_rate": 6.2558983666061706e-06, + "loss": 38.8381, + "step": 4371 + }, + { + "epoch": 15.783747178329572, + "grad_norm": 303.66229248046875, + "learning_rate": 6.250453720508168e-06, + "loss": 36.5951, + "step": 4372 + }, + { + "epoch": 15.787358916478555, + "grad_norm": 231.65744018554688, + "learning_rate": 6.245009074410164e-06, + "loss": 36.4717, + "step": 4373 + }, + { + "epoch": 15.79097065462754, + "grad_norm": 235.25833129882812, + "learning_rate": 6.239564428312159e-06, + "loss": 38.4578, + "step": 4374 + }, + { + "epoch": 15.794582392776524, + "grad_norm": 215.5384063720703, + "learning_rate": 6.234119782214156e-06, + "loss": 38.0475, + "step": 4375 + }, + { + "epoch": 15.798194130925507, + "grad_norm": 216.3609619140625, + "learning_rate": 6.2286751361161526e-06, + "loss": 37.1825, + "step": 4376 + }, + { + "epoch": 15.801805869074492, + "grad_norm": 275.54522705078125, + "learning_rate": 6.223230490018149e-06, + "loss": 38.5608, + "step": 4377 + }, + { + "epoch": 15.805417607223477, + "grad_norm": 226.7752685546875, + "learning_rate": 6.217785843920145e-06, + "loss": 38.0612, + "step": 4378 + }, + { + "epoch": 15.80902934537246, + "grad_norm": 262.14501953125, + "learning_rate": 6.212341197822142e-06, + "loss": 38.0049, + "step": 4379 + }, + { + "epoch": 15.812641083521445, + "grad_norm": 299.82196044921875, + "learning_rate": 6.206896551724138e-06, + "loss": 39.1441, + "step": 4380 + }, + { + "epoch": 15.812641083521445, + "eval_loss": 0.6033969521522522, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4380 + }, + { + "epoch": 15.816252821670428, + "grad_norm": 295.24188232421875, + "learning_rate": 6.2014519056261346e-06, + "loss": 39.266, + "step": 4381 + }, + { + "epoch": 15.819864559819413, + "grad_norm": 298.1729736328125, + "learning_rate": 6.196007259528131e-06, + "loss": 39.4025, + "step": 4382 + }, + { + "epoch": 15.823476297968398, + "grad_norm": 234.97958374023438, + "learning_rate": 6.190562613430127e-06, + "loss": 39.4752, + "step": 4383 + }, + { + "epoch": 15.827088036117381, + "grad_norm": 270.3009338378906, + "learning_rate": 6.185117967332124e-06, + "loss": 36.0322, + "step": 4384 + }, + { + "epoch": 15.830699774266366, + "grad_norm": 279.78314208984375, + "learning_rate": 6.1796733212341195e-06, + "loss": 33.3256, + "step": 4385 + }, + { + "epoch": 15.83431151241535, + "grad_norm": 258.82598876953125, + "learning_rate": 6.1742286751361166e-06, + "loss": 33.1552, + "step": 4386 + }, + { + "epoch": 15.837923250564334, + "grad_norm": 280.8109130859375, + "learning_rate": 6.168784029038113e-06, + "loss": 32.0024, + "step": 4387 + }, + { + "epoch": 15.841534988713319, + "grad_norm": 265.08111572265625, + "learning_rate": 6.163339382940109e-06, + "loss": 32.4901, + "step": 4388 + }, + { + "epoch": 15.845146726862303, + "grad_norm": 316.56427001953125, + "learning_rate": 6.157894736842105e-06, + "loss": 33.1995, + "step": 4389 + }, + { + "epoch": 15.848758465011286, + "grad_norm": 256.03717041015625, + "learning_rate": 6.1524500907441015e-06, + "loss": 33.1914, + "step": 4390 + }, + { + "epoch": 15.848758465011286, + "eval_loss": 0.6017575263977051, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 4390 + }, + { + "epoch": 15.852370203160271, + "grad_norm": 242.54119873046875, + "learning_rate": 6.1470054446460985e-06, + "loss": 33.8459, + "step": 4391 + }, + { + "epoch": 15.855981941309254, + "grad_norm": 259.1406555175781, + "learning_rate": 6.141560798548094e-06, + "loss": 34.1317, + "step": 4392 + }, + { + "epoch": 15.85959367945824, + "grad_norm": 272.77880859375, + "learning_rate": 6.136116152450091e-06, + "loss": 34.2777, + "step": 4393 + }, + { + "epoch": 15.863205417607224, + "grad_norm": 231.60845947265625, + "learning_rate": 6.130671506352087e-06, + "loss": 34.0165, + "step": 4394 + }, + { + "epoch": 15.866817155756207, + "grad_norm": 230.85675048828125, + "learning_rate": 6.125226860254084e-06, + "loss": 34.2761, + "step": 4395 + }, + { + "epoch": 15.870428893905192, + "grad_norm": 307.4486389160156, + "learning_rate": 6.11978221415608e-06, + "loss": 33.7407, + "step": 4396 + }, + { + "epoch": 15.874040632054175, + "grad_norm": 264.7835388183594, + "learning_rate": 6.114337568058076e-06, + "loss": 34.1672, + "step": 4397 + }, + { + "epoch": 15.87765237020316, + "grad_norm": 234.93968200683594, + "learning_rate": 6.108892921960073e-06, + "loss": 35.7158, + "step": 4398 + }, + { + "epoch": 15.881264108352145, + "grad_norm": 300.0079345703125, + "learning_rate": 6.103448275862069e-06, + "loss": 36.1292, + "step": 4399 + }, + { + "epoch": 15.884875846501128, + "grad_norm": 326.20416259765625, + "learning_rate": 6.0980036297640655e-06, + "loss": 34.8222, + "step": 4400 + }, + { + "epoch": 15.884875846501128, + "eval_loss": 0.6024067401885986, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4400 + }, + { + "epoch": 15.888487584650113, + "grad_norm": 214.6174774169922, + "learning_rate": 6.092558983666062e-06, + "loss": 27.4819, + "step": 4401 + }, + { + "epoch": 15.892099322799098, + "grad_norm": 222.7063446044922, + "learning_rate": 6.087114337568059e-06, + "loss": 22.3862, + "step": 4402 + }, + { + "epoch": 15.89571106094808, + "grad_norm": 277.0006103515625, + "learning_rate": 6.081669691470054e-06, + "loss": 22.8483, + "step": 4403 + }, + { + "epoch": 15.899322799097066, + "grad_norm": 264.3949890136719, + "learning_rate": 6.076225045372051e-06, + "loss": 23.2021, + "step": 4404 + }, + { + "epoch": 15.90293453724605, + "grad_norm": 244.04611206054688, + "learning_rate": 6.0707803992740475e-06, + "loss": 23.9378, + "step": 4405 + }, + { + "epoch": 15.906546275395034, + "grad_norm": 219.24403381347656, + "learning_rate": 6.065335753176044e-06, + "loss": 39.4708, + "step": 4406 + }, + { + "epoch": 15.910158013544018, + "grad_norm": 297.3822937011719, + "learning_rate": 6.05989110707804e-06, + "loss": 39.9151, + "step": 4407 + }, + { + "epoch": 15.913769751693001, + "grad_norm": 282.748291015625, + "learning_rate": 6.054446460980036e-06, + "loss": 39.0545, + "step": 4408 + }, + { + "epoch": 15.917381489841986, + "grad_norm": 274.6419982910156, + "learning_rate": 6.049001814882033e-06, + "loss": 39.7046, + "step": 4409 + }, + { + "epoch": 15.920993227990971, + "grad_norm": 261.2831115722656, + "learning_rate": 6.0435571687840295e-06, + "loss": 39.8849, + "step": 4410 + }, + { + "epoch": 15.920993227990971, + "eval_loss": 0.6017056107521057, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 4410 + }, + { + "epoch": 15.924604966139954, + "grad_norm": 276.61505126953125, + "learning_rate": 6.038112522686026e-06, + "loss": 39.8861, + "step": 4411 + }, + { + "epoch": 15.928216704288939, + "grad_norm": 273.4017333984375, + "learning_rate": 6.032667876588022e-06, + "loss": 36.2526, + "step": 4412 + }, + { + "epoch": 15.931828442437924, + "grad_norm": 314.4811706542969, + "learning_rate": 6.027223230490018e-06, + "loss": 37.1316, + "step": 4413 + }, + { + "epoch": 15.935440180586907, + "grad_norm": 265.7447204589844, + "learning_rate": 6.021778584392014e-06, + "loss": 38.1698, + "step": 4414 + }, + { + "epoch": 15.939051918735892, + "grad_norm": 448.373291015625, + "learning_rate": 6.016333938294011e-06, + "loss": 38.9541, + "step": 4415 + }, + { + "epoch": 15.942663656884875, + "grad_norm": 261.33966064453125, + "learning_rate": 6.010889292196008e-06, + "loss": 36.6694, + "step": 4416 + }, + { + "epoch": 15.94627539503386, + "grad_norm": 383.16363525390625, + "learning_rate": 6.005444646098004e-06, + "loss": 39.1773, + "step": 4417 + }, + { + "epoch": 15.949887133182845, + "grad_norm": 279.26446533203125, + "learning_rate": 6e-06, + "loss": 36.9482, + "step": 4418 + }, + { + "epoch": 15.953498871331828, + "grad_norm": 307.5321960449219, + "learning_rate": 5.994555353901996e-06, + "loss": 36.653, + "step": 4419 + }, + { + "epoch": 15.957110609480813, + "grad_norm": 412.80023193359375, + "learning_rate": 5.989110707803993e-06, + "loss": 36.3768, + "step": 4420 + }, + { + "epoch": 15.957110609480813, + "eval_loss": 0.6033455729484558, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4420 + }, + { + "epoch": 15.960722347629797, + "grad_norm": 254.2952880859375, + "learning_rate": 5.98366606170599e-06, + "loss": 32.546, + "step": 4421 + }, + { + "epoch": 15.96433408577878, + "grad_norm": 324.0749816894531, + "learning_rate": 5.978221415607985e-06, + "loss": 32.7021, + "step": 4422 + }, + { + "epoch": 15.967945823927765, + "grad_norm": 326.0075988769531, + "learning_rate": 5.972776769509982e-06, + "loss": 33.3823, + "step": 4423 + }, + { + "epoch": 15.97155756207675, + "grad_norm": 252.98471069335938, + "learning_rate": 5.967332123411978e-06, + "loss": 33.3397, + "step": 4424 + }, + { + "epoch": 15.975169300225733, + "grad_norm": 243.14117431640625, + "learning_rate": 5.9618874773139755e-06, + "loss": 34.2781, + "step": 4425 + }, + { + "epoch": 15.978781038374718, + "grad_norm": 304.3429260253906, + "learning_rate": 5.956442831215971e-06, + "loss": 34.1163, + "step": 4426 + }, + { + "epoch": 15.982392776523701, + "grad_norm": 320.1651916503906, + "learning_rate": 5.950998185117968e-06, + "loss": 34.1024, + "step": 4427 + }, + { + "epoch": 15.986004514672686, + "grad_norm": 252.0004425048828, + "learning_rate": 5.945553539019964e-06, + "loss": 35.8121, + "step": 4428 + }, + { + "epoch": 15.989616252821671, + "grad_norm": 342.5635986328125, + "learning_rate": 5.9401088929219595e-06, + "loss": 35.6666, + "step": 4429 + }, + { + "epoch": 15.993227990970654, + "grad_norm": 226.57249450683594, + "learning_rate": 5.934664246823957e-06, + "loss": 30.2617, + "step": 4430 + }, + { + "epoch": 15.993227990970654, + "eval_loss": 0.6029886603355408, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4430 + }, + { + "epoch": 15.996839729119639, + "grad_norm": 202.94903564453125, + "learning_rate": 5.929219600725953e-06, + "loss": 22.8166, + "step": 4431 + }, + { + "epoch": 16.0, + "grad_norm": 200.84317016601562, + "learning_rate": 5.92377495462795e-06, + "loss": 20.3903, + "step": 4432 + }, + { + "epoch": 16.003611738148983, + "grad_norm": 230.5917510986328, + "learning_rate": 5.918330308529945e-06, + "loss": 39.0985, + "step": 4433 + }, + { + "epoch": 16.00722347629797, + "grad_norm": 285.6978759765625, + "learning_rate": 5.912885662431942e-06, + "loss": 39.2128, + "step": 4434 + }, + { + "epoch": 16.010835214446953, + "grad_norm": 221.70896911621094, + "learning_rate": 5.907441016333939e-06, + "loss": 38.9026, + "step": 4435 + }, + { + "epoch": 16.014446952595936, + "grad_norm": 318.14068603515625, + "learning_rate": 5.901996370235935e-06, + "loss": 38.7336, + "step": 4436 + }, + { + "epoch": 16.018058690744923, + "grad_norm": 324.451904296875, + "learning_rate": 5.896551724137931e-06, + "loss": 38.7117, + "step": 4437 + }, + { + "epoch": 16.021670428893906, + "grad_norm": 295.038818359375, + "learning_rate": 5.891107078039927e-06, + "loss": 39.6053, + "step": 4438 + }, + { + "epoch": 16.02528216704289, + "grad_norm": 267.0055236816406, + "learning_rate": 5.885662431941924e-06, + "loss": 38.931, + "step": 4439 + }, + { + "epoch": 16.028893905191875, + "grad_norm": 269.20074462890625, + "learning_rate": 5.88021778584392e-06, + "loss": 41.1717, + "step": 4440 + }, + { + "epoch": 16.028893905191875, + "eval_loss": 0.6036069393157959, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.899, + "eval_steps_per_second": 56.899, + "step": 4440 + }, + { + "epoch": 16.03250564334086, + "grad_norm": 241.9443359375, + "learning_rate": 5.874773139745917e-06, + "loss": 38.7027, + "step": 4441 + }, + { + "epoch": 16.03611738148984, + "grad_norm": 238.54847717285156, + "learning_rate": 5.869328493647913e-06, + "loss": 39.1284, + "step": 4442 + }, + { + "epoch": 16.039729119638825, + "grad_norm": 339.3023681640625, + "learning_rate": 5.863883847549909e-06, + "loss": 38.0767, + "step": 4443 + }, + { + "epoch": 16.04334085778781, + "grad_norm": 257.29522705078125, + "learning_rate": 5.8584392014519055e-06, + "loss": 34.8207, + "step": 4444 + }, + { + "epoch": 16.046952595936794, + "grad_norm": 264.24200439453125, + "learning_rate": 5.852994555353902e-06, + "loss": 35.5021, + "step": 4445 + }, + { + "epoch": 16.050564334085777, + "grad_norm": 251.3128662109375, + "learning_rate": 5.847549909255899e-06, + "loss": 35.7826, + "step": 4446 + }, + { + "epoch": 16.054176072234764, + "grad_norm": 310.6581726074219, + "learning_rate": 5.842105263157895e-06, + "loss": 36.7373, + "step": 4447 + }, + { + "epoch": 16.057787810383747, + "grad_norm": 299.07550048828125, + "learning_rate": 5.836660617059891e-06, + "loss": 36.4048, + "step": 4448 + }, + { + "epoch": 16.06139954853273, + "grad_norm": 257.58740234375, + "learning_rate": 5.8312159709618875e-06, + "loss": 36.3982, + "step": 4449 + }, + { + "epoch": 16.065011286681717, + "grad_norm": 337.6795654296875, + "learning_rate": 5.825771324863884e-06, + "loss": 36.8518, + "step": 4450 + }, + { + "epoch": 16.065011286681717, + "eval_loss": 0.6036850214004517, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4450 + }, + { + "epoch": 16.0686230248307, + "grad_norm": 275.02423095703125, + "learning_rate": 5.820326678765881e-06, + "loss": 36.1763, + "step": 4451 + }, + { + "epoch": 16.072234762979683, + "grad_norm": 263.4334716796875, + "learning_rate": 5.814882032667876e-06, + "loss": 37.6417, + "step": 4452 + }, + { + "epoch": 16.07584650112867, + "grad_norm": 213.16749572753906, + "learning_rate": 5.809437386569873e-06, + "loss": 35.6537, + "step": 4453 + }, + { + "epoch": 16.079458239277653, + "grad_norm": 263.4288330078125, + "learning_rate": 5.8039927404718695e-06, + "loss": 36.5693, + "step": 4454 + }, + { + "epoch": 16.083069977426636, + "grad_norm": 284.67254638671875, + "learning_rate": 5.798548094373866e-06, + "loss": 37.3424, + "step": 4455 + }, + { + "epoch": 16.086681715575622, + "grad_norm": 355.7987060546875, + "learning_rate": 5.793103448275862e-06, + "loss": 38.7851, + "step": 4456 + }, + { + "epoch": 16.090293453724605, + "grad_norm": 249.7351531982422, + "learning_rate": 5.787658802177859e-06, + "loss": 38.1334, + "step": 4457 + }, + { + "epoch": 16.09390519187359, + "grad_norm": 257.4977722167969, + "learning_rate": 5.782214156079855e-06, + "loss": 37.8369, + "step": 4458 + }, + { + "epoch": 16.097516930022575, + "grad_norm": 242.59584045410156, + "learning_rate": 5.776769509981851e-06, + "loss": 37.4005, + "step": 4459 + }, + { + "epoch": 16.101128668171558, + "grad_norm": 270.0740966796875, + "learning_rate": 5.771324863883848e-06, + "loss": 38.2287, + "step": 4460 + }, + { + "epoch": 16.101128668171558, + "eval_loss": 0.6018803119659424, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 4460 + }, + { + "epoch": 16.10474040632054, + "grad_norm": 225.32322692871094, + "learning_rate": 5.765880217785844e-06, + "loss": 35.7162, + "step": 4461 + }, + { + "epoch": 16.108352144469524, + "grad_norm": 275.3272705078125, + "learning_rate": 5.760435571687841e-06, + "loss": 32.8733, + "step": 4462 + }, + { + "epoch": 16.11196388261851, + "grad_norm": 259.5124206542969, + "learning_rate": 5.7549909255898364e-06, + "loss": 33.2271, + "step": 4463 + }, + { + "epoch": 16.115575620767494, + "grad_norm": 249.75738525390625, + "learning_rate": 5.7495462794918335e-06, + "loss": 30.2931, + "step": 4464 + }, + { + "epoch": 16.119187358916477, + "grad_norm": 277.7652282714844, + "learning_rate": 5.74410163339383e-06, + "loss": 30.9294, + "step": 4465 + }, + { + "epoch": 16.122799097065464, + "grad_norm": 223.28250122070312, + "learning_rate": 5.738656987295825e-06, + "loss": 31.7337, + "step": 4466 + }, + { + "epoch": 16.126410835214447, + "grad_norm": 259.5106201171875, + "learning_rate": 5.733212341197822e-06, + "loss": 31.2897, + "step": 4467 + }, + { + "epoch": 16.13002257336343, + "grad_norm": 241.0313720703125, + "learning_rate": 5.7277676950998184e-06, + "loss": 32.8436, + "step": 4468 + }, + { + "epoch": 16.133634311512417, + "grad_norm": 277.46905517578125, + "learning_rate": 5.7223230490018155e-06, + "loss": 33.6823, + "step": 4469 + }, + { + "epoch": 16.1372460496614, + "grad_norm": 264.2905578613281, + "learning_rate": 5.716878402903811e-06, + "loss": 33.1107, + "step": 4470 + }, + { + "epoch": 16.1372460496614, + "eval_loss": 0.6046355962753296, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 4470 + }, + { + "epoch": 16.140857787810383, + "grad_norm": 295.5188903808594, + "learning_rate": 5.711433756805808e-06, + "loss": 33.6291, + "step": 4471 + }, + { + "epoch": 16.14446952595937, + "grad_norm": 282.6014709472656, + "learning_rate": 5.705989110707804e-06, + "loss": 33.0773, + "step": 4472 + }, + { + "epoch": 16.148081264108352, + "grad_norm": 270.7958679199219, + "learning_rate": 5.7005444646098004e-06, + "loss": 35.0269, + "step": 4473 + }, + { + "epoch": 16.151693002257336, + "grad_norm": 344.7304992675781, + "learning_rate": 5.695099818511797e-06, + "loss": 35.1349, + "step": 4474 + }, + { + "epoch": 16.155304740406322, + "grad_norm": 294.5618896484375, + "learning_rate": 5.689655172413793e-06, + "loss": 36.3309, + "step": 4475 + }, + { + "epoch": 16.158916478555305, + "grad_norm": 305.5354309082031, + "learning_rate": 5.68421052631579e-06, + "loss": 35.0976, + "step": 4476 + }, + { + "epoch": 16.16252821670429, + "grad_norm": 293.9934387207031, + "learning_rate": 5.678765880217786e-06, + "loss": 34.9113, + "step": 4477 + }, + { + "epoch": 16.16613995485327, + "grad_norm": 277.9523010253906, + "learning_rate": 5.6733212341197824e-06, + "loss": 24.8815, + "step": 4478 + }, + { + "epoch": 16.169751693002258, + "grad_norm": 297.0547790527344, + "learning_rate": 5.667876588021779e-06, + "loss": 22.4544, + "step": 4479 + }, + { + "epoch": 16.17336343115124, + "grad_norm": 237.44741821289062, + "learning_rate": 5.662431941923776e-06, + "loss": 21.8323, + "step": 4480 + }, + { + "epoch": 16.17336343115124, + "eval_loss": 0.6061411499977112, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4480 + }, + { + "epoch": 16.176975169300224, + "grad_norm": 220.5832977294922, + "learning_rate": 5.656987295825771e-06, + "loss": 22.7531, + "step": 4481 + }, + { + "epoch": 16.18058690744921, + "grad_norm": 298.8033142089844, + "learning_rate": 5.651542649727767e-06, + "loss": 23.7107, + "step": 4482 + }, + { + "epoch": 16.184198645598194, + "grad_norm": 250.02593994140625, + "learning_rate": 5.6460980036297644e-06, + "loss": 39.1679, + "step": 4483 + }, + { + "epoch": 16.187810383747177, + "grad_norm": 253.00746154785156, + "learning_rate": 5.640653357531761e-06, + "loss": 40.6492, + "step": 4484 + }, + { + "epoch": 16.191422121896164, + "grad_norm": 215.04270935058594, + "learning_rate": 5.635208711433757e-06, + "loss": 38.604, + "step": 4485 + }, + { + "epoch": 16.195033860045147, + "grad_norm": 395.6152648925781, + "learning_rate": 5.629764065335753e-06, + "loss": 39.1417, + "step": 4486 + }, + { + "epoch": 16.19864559819413, + "grad_norm": 380.3653869628906, + "learning_rate": 5.62431941923775e-06, + "loss": 39.4322, + "step": 4487 + }, + { + "epoch": 16.202257336343116, + "grad_norm": 309.3524475097656, + "learning_rate": 5.6188747731397464e-06, + "loss": 39.1721, + "step": 4488 + }, + { + "epoch": 16.2058690744921, + "grad_norm": 237.88262939453125, + "learning_rate": 5.613430127041742e-06, + "loss": 39.1462, + "step": 4489 + }, + { + "epoch": 16.209480812641083, + "grad_norm": 233.66690063476562, + "learning_rate": 5.607985480943739e-06, + "loss": 39.8177, + "step": 4490 + }, + { + "epoch": 16.209480812641083, + "eval_loss": 0.6043822169303894, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4490 + }, + { + "epoch": 16.21309255079007, + "grad_norm": 229.3720703125, + "learning_rate": 5.602540834845735e-06, + "loss": 39.7878, + "step": 4491 + }, + { + "epoch": 16.216704288939052, + "grad_norm": 228.66493225097656, + "learning_rate": 5.597096188747731e-06, + "loss": 40.0754, + "step": 4492 + }, + { + "epoch": 16.220316027088035, + "grad_norm": 276.40240478515625, + "learning_rate": 5.591651542649728e-06, + "loss": 38.7709, + "step": 4493 + }, + { + "epoch": 16.223927765237022, + "grad_norm": 268.62371826171875, + "learning_rate": 5.586206896551725e-06, + "loss": 37.7439, + "step": 4494 + }, + { + "epoch": 16.227539503386005, + "grad_norm": 271.0934753417969, + "learning_rate": 5.580762250453721e-06, + "loss": 38.2511, + "step": 4495 + }, + { + "epoch": 16.231151241534988, + "grad_norm": 253.63385009765625, + "learning_rate": 5.575317604355716e-06, + "loss": 36.716, + "step": 4496 + }, + { + "epoch": 16.23476297968397, + "grad_norm": 265.1177978515625, + "learning_rate": 5.569872958257713e-06, + "loss": 36.5517, + "step": 4497 + }, + { + "epoch": 16.238374717832958, + "grad_norm": 332.52972412109375, + "learning_rate": 5.56442831215971e-06, + "loss": 37.1524, + "step": 4498 + }, + { + "epoch": 16.24198645598194, + "grad_norm": 247.53643798828125, + "learning_rate": 5.558983666061707e-06, + "loss": 36.6666, + "step": 4499 + }, + { + "epoch": 16.245598194130924, + "grad_norm": 233.3318634033203, + "learning_rate": 5.553539019963702e-06, + "loss": 37.0842, + "step": 4500 + }, + { + "epoch": 16.245598194130924, + "eval_loss": 0.6042913794517517, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4500 + }, + { + "epoch": 16.24920993227991, + "grad_norm": 222.98350524902344, + "learning_rate": 5.548094373865699e-06, + "loss": 37.6382, + "step": 4501 + }, + { + "epoch": 16.252821670428894, + "grad_norm": 234.33267211914062, + "learning_rate": 5.542649727767695e-06, + "loss": 38.0509, + "step": 4502 + }, + { + "epoch": 16.256433408577877, + "grad_norm": 303.56005859375, + "learning_rate": 5.5372050816696924e-06, + "loss": 36.509, + "step": 4503 + }, + { + "epoch": 16.260045146726863, + "grad_norm": 232.0821075439453, + "learning_rate": 5.531760435571688e-06, + "loss": 36.3975, + "step": 4504 + }, + { + "epoch": 16.263656884875846, + "grad_norm": 223.3292236328125, + "learning_rate": 5.526315789473684e-06, + "loss": 37.0448, + "step": 4505 + }, + { + "epoch": 16.26726862302483, + "grad_norm": 241.2131805419922, + "learning_rate": 5.520871143375681e-06, + "loss": 37.8635, + "step": 4506 + }, + { + "epoch": 16.270880361173816, + "grad_norm": 288.62689208984375, + "learning_rate": 5.5154264972776765e-06, + "loss": 38.2789, + "step": 4507 + }, + { + "epoch": 16.2744920993228, + "grad_norm": 262.59637451171875, + "learning_rate": 5.5099818511796736e-06, + "loss": 37.9052, + "step": 4508 + }, + { + "epoch": 16.278103837471782, + "grad_norm": 258.0476379394531, + "learning_rate": 5.50453720508167e-06, + "loss": 38.0485, + "step": 4509 + }, + { + "epoch": 16.28171557562077, + "grad_norm": 295.2730407714844, + "learning_rate": 5.499092558983667e-06, + "loss": 37.6134, + "step": 4510 + }, + { + "epoch": 16.28171557562077, + "eval_loss": 0.601740300655365, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4510 + }, + { + "epoch": 16.285327313769752, + "grad_norm": 246.38548278808594, + "learning_rate": 5.493647912885662e-06, + "loss": 36.1289, + "step": 4511 + }, + { + "epoch": 16.288939051918735, + "grad_norm": 271.28997802734375, + "learning_rate": 5.4882032667876585e-06, + "loss": 31.8834, + "step": 4512 + }, + { + "epoch": 16.292550790067722, + "grad_norm": 231.76246643066406, + "learning_rate": 5.4827586206896556e-06, + "loss": 31.4899, + "step": 4513 + }, + { + "epoch": 16.296162528216705, + "grad_norm": 238.7414093017578, + "learning_rate": 5.477313974591652e-06, + "loss": 31.7102, + "step": 4514 + }, + { + "epoch": 16.299774266365688, + "grad_norm": 302.0710144042969, + "learning_rate": 5.471869328493648e-06, + "loss": 31.3557, + "step": 4515 + }, + { + "epoch": 16.30338600451467, + "grad_norm": 282.72015380859375, + "learning_rate": 5.466424682395644e-06, + "loss": 33.0781, + "step": 4516 + }, + { + "epoch": 16.306997742663658, + "grad_norm": 224.8140869140625, + "learning_rate": 5.460980036297641e-06, + "loss": 33.2963, + "step": 4517 + }, + { + "epoch": 16.31060948081264, + "grad_norm": 239.20570373535156, + "learning_rate": 5.4555353901996376e-06, + "loss": 34.4455, + "step": 4518 + }, + { + "epoch": 16.314221218961624, + "grad_norm": 304.7758483886719, + "learning_rate": 5.450090744101633e-06, + "loss": 34.534, + "step": 4519 + }, + { + "epoch": 16.31783295711061, + "grad_norm": 274.8758239746094, + "learning_rate": 5.44464609800363e-06, + "loss": 33.5232, + "step": 4520 + }, + { + "epoch": 16.31783295711061, + "eval_loss": 0.6031973958015442, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4520 + }, + { + "epoch": 16.321444695259594, + "grad_norm": 295.1776428222656, + "learning_rate": 5.439201451905626e-06, + "loss": 33.403, + "step": 4521 + }, + { + "epoch": 16.325056433408577, + "grad_norm": 309.03399658203125, + "learning_rate": 5.4337568058076225e-06, + "loss": 34.1785, + "step": 4522 + }, + { + "epoch": 16.328668171557563, + "grad_norm": 285.26385498046875, + "learning_rate": 5.428312159709619e-06, + "loss": 34.4855, + "step": 4523 + }, + { + "epoch": 16.332279909706546, + "grad_norm": 307.0184020996094, + "learning_rate": 5.422867513611616e-06, + "loss": 32.4791, + "step": 4524 + }, + { + "epoch": 16.33589164785553, + "grad_norm": 318.8267822265625, + "learning_rate": 5.417422867513612e-06, + "loss": 35.697, + "step": 4525 + }, + { + "epoch": 16.339503386004516, + "grad_norm": 356.0179138183594, + "learning_rate": 5.411978221415607e-06, + "loss": 36.1811, + "step": 4526 + }, + { + "epoch": 16.3431151241535, + "grad_norm": 332.1255187988281, + "learning_rate": 5.4065335753176045e-06, + "loss": 36.2251, + "step": 4527 + }, + { + "epoch": 16.346726862302482, + "grad_norm": 288.78118896484375, + "learning_rate": 5.401088929219601e-06, + "loss": 32.0518, + "step": 4528 + }, + { + "epoch": 16.35033860045147, + "grad_norm": 250.37245178222656, + "learning_rate": 5.395644283121598e-06, + "loss": 23.627, + "step": 4529 + }, + { + "epoch": 16.353950338600452, + "grad_norm": 199.92352294921875, + "learning_rate": 5.390199637023593e-06, + "loss": 21.7919, + "step": 4530 + }, + { + "epoch": 16.353950338600452, + "eval_loss": 0.6021688580513, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 4530 + }, + { + "epoch": 16.357562076749435, + "grad_norm": 265.47015380859375, + "learning_rate": 5.38475499092559e-06, + "loss": 23.0672, + "step": 4531 + }, + { + "epoch": 16.36117381489842, + "grad_norm": 281.188720703125, + "learning_rate": 5.3793103448275865e-06, + "loss": 22.7983, + "step": 4532 + }, + { + "epoch": 16.364785553047405, + "grad_norm": 195.5351104736328, + "learning_rate": 5.373865698729583e-06, + "loss": 38.1042, + "step": 4533 + }, + { + "epoch": 16.368397291196388, + "grad_norm": 234.76573181152344, + "learning_rate": 5.368421052631579e-06, + "loss": 39.8602, + "step": 4534 + }, + { + "epoch": 16.37200902934537, + "grad_norm": 237.9152374267578, + "learning_rate": 5.362976406533575e-06, + "loss": 40.2156, + "step": 4535 + }, + { + "epoch": 16.375620767494357, + "grad_norm": 297.722900390625, + "learning_rate": 5.357531760435572e-06, + "loss": 39.3676, + "step": 4536 + }, + { + "epoch": 16.37923250564334, + "grad_norm": 218.61727905273438, + "learning_rate": 5.352087114337568e-06, + "loss": 38.7905, + "step": 4537 + }, + { + "epoch": 16.382844243792324, + "grad_norm": 245.19561767578125, + "learning_rate": 5.346642468239565e-06, + "loss": 39.3998, + "step": 4538 + }, + { + "epoch": 16.38645598194131, + "grad_norm": 247.5048370361328, + "learning_rate": 5.341197822141561e-06, + "loss": 40.0835, + "step": 4539 + }, + { + "epoch": 16.390067720090293, + "grad_norm": 214.40684509277344, + "learning_rate": 5.335753176043558e-06, + "loss": 39.1135, + "step": 4540 + }, + { + "epoch": 16.390067720090293, + "eval_loss": 0.6014460325241089, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4540 + }, + { + "epoch": 16.393679458239276, + "grad_norm": 216.72271728515625, + "learning_rate": 5.330308529945553e-06, + "loss": 38.9449, + "step": 4541 + }, + { + "epoch": 16.397291196388263, + "grad_norm": 224.22262573242188, + "learning_rate": 5.32486388384755e-06, + "loss": 39.2646, + "step": 4542 + }, + { + "epoch": 16.400902934537246, + "grad_norm": 258.6524353027344, + "learning_rate": 5.319419237749547e-06, + "loss": 38.0846, + "step": 4543 + }, + { + "epoch": 16.40451467268623, + "grad_norm": 241.7313232421875, + "learning_rate": 5.313974591651543e-06, + "loss": 37.4963, + "step": 4544 + }, + { + "epoch": 16.408126410835216, + "grad_norm": 241.3990478515625, + "learning_rate": 5.308529945553539e-06, + "loss": 36.4783, + "step": 4545 + }, + { + "epoch": 16.4117381489842, + "grad_norm": 207.1470947265625, + "learning_rate": 5.303085299455535e-06, + "loss": 36.1592, + "step": 4546 + }, + { + "epoch": 16.415349887133182, + "grad_norm": 224.51690673828125, + "learning_rate": 5.2976406533575325e-06, + "loss": 35.7946, + "step": 4547 + }, + { + "epoch": 16.41896162528217, + "grad_norm": 292.4340515136719, + "learning_rate": 5.292196007259528e-06, + "loss": 36.8986, + "step": 4548 + }, + { + "epoch": 16.42257336343115, + "grad_norm": 244.67117309570312, + "learning_rate": 5.286751361161524e-06, + "loss": 37.1165, + "step": 4549 + }, + { + "epoch": 16.426185101580135, + "grad_norm": 331.14654541015625, + "learning_rate": 5.281306715063521e-06, + "loss": 36.4423, + "step": 4550 + }, + { + "epoch": 16.426185101580135, + "eval_loss": 0.6067427396774292, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4550 + }, + { + "epoch": 16.42979683972912, + "grad_norm": 262.373046875, + "learning_rate": 5.275862068965517e-06, + "loss": 39.0014, + "step": 4551 + }, + { + "epoch": 16.433408577878104, + "grad_norm": 237.48350524902344, + "learning_rate": 5.270417422867514e-06, + "loss": 38.0152, + "step": 4552 + }, + { + "epoch": 16.437020316027088, + "grad_norm": 273.0652770996094, + "learning_rate": 5.26497277676951e-06, + "loss": 37.6952, + "step": 4553 + }, + { + "epoch": 16.44063205417607, + "grad_norm": 239.0780029296875, + "learning_rate": 5.259528130671507e-06, + "loss": 38.4266, + "step": 4554 + }, + { + "epoch": 16.444243792325057, + "grad_norm": 277.978759765625, + "learning_rate": 5.254083484573503e-06, + "loss": 36.5596, + "step": 4555 + }, + { + "epoch": 16.44785553047404, + "grad_norm": 216.2267303466797, + "learning_rate": 5.248638838475499e-06, + "loss": 39.1408, + "step": 4556 + }, + { + "epoch": 16.451467268623023, + "grad_norm": 231.80581665039062, + "learning_rate": 5.243194192377496e-06, + "loss": 38.7286, + "step": 4557 + }, + { + "epoch": 16.45507900677201, + "grad_norm": 236.4004669189453, + "learning_rate": 5.237749546279492e-06, + "loss": 39.2426, + "step": 4558 + }, + { + "epoch": 16.458690744920993, + "grad_norm": 270.0268859863281, + "learning_rate": 5.232304900181488e-06, + "loss": 38.6546, + "step": 4559 + }, + { + "epoch": 16.462302483069976, + "grad_norm": 255.8044891357422, + "learning_rate": 5.226860254083484e-06, + "loss": 37.554, + "step": 4560 + }, + { + "epoch": 16.462302483069976, + "eval_loss": 0.6019929647445679, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4560 + }, + { + "epoch": 16.465914221218963, + "grad_norm": 321.18499755859375, + "learning_rate": 5.221415607985481e-06, + "loss": 34.9309, + "step": 4561 + }, + { + "epoch": 16.469525959367946, + "grad_norm": 311.94305419921875, + "learning_rate": 5.215970961887478e-06, + "loss": 35.8779, + "step": 4562 + }, + { + "epoch": 16.47313769751693, + "grad_norm": 211.90234375, + "learning_rate": 5.210526315789474e-06, + "loss": 31.8385, + "step": 4563 + }, + { + "epoch": 16.476749435665916, + "grad_norm": 284.64581298828125, + "learning_rate": 5.20508166969147e-06, + "loss": 31.8078, + "step": 4564 + }, + { + "epoch": 16.4803611738149, + "grad_norm": 291.94891357421875, + "learning_rate": 5.199637023593466e-06, + "loss": 33.2542, + "step": 4565 + }, + { + "epoch": 16.483972911963882, + "grad_norm": 243.61956787109375, + "learning_rate": 5.194192377495463e-06, + "loss": 31.5292, + "step": 4566 + }, + { + "epoch": 16.48758465011287, + "grad_norm": 242.07696533203125, + "learning_rate": 5.188747731397459e-06, + "loss": 33.9643, + "step": 4567 + }, + { + "epoch": 16.49119638826185, + "grad_norm": 255.0625457763672, + "learning_rate": 5.183303085299456e-06, + "loss": 33.7718, + "step": 4568 + }, + { + "epoch": 16.494808126410835, + "grad_norm": 249.40240478515625, + "learning_rate": 5.177858439201452e-06, + "loss": 31.5248, + "step": 4569 + }, + { + "epoch": 16.498419864559818, + "grad_norm": 231.3375244140625, + "learning_rate": 5.172413793103449e-06, + "loss": 34.5657, + "step": 4570 + }, + { + "epoch": 16.498419864559818, + "eval_loss": 0.6017265319824219, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.01, + "eval_steps_per_second": 57.01, + "step": 4570 + }, + { + "epoch": 16.502031602708804, + "grad_norm": 247.97012329101562, + "learning_rate": 5.1669691470054445e-06, + "loss": 33.766, + "step": 4571 + }, + { + "epoch": 16.505643340857787, + "grad_norm": 310.730224609375, + "learning_rate": 5.161524500907441e-06, + "loss": 34.0841, + "step": 4572 + }, + { + "epoch": 16.50925507900677, + "grad_norm": 323.5569152832031, + "learning_rate": 5.156079854809438e-06, + "loss": 35.0788, + "step": 4573 + }, + { + "epoch": 16.512866817155757, + "grad_norm": 247.95480346679688, + "learning_rate": 5.150635208711433e-06, + "loss": 33.5322, + "step": 4574 + }, + { + "epoch": 16.51647855530474, + "grad_norm": 307.6163024902344, + "learning_rate": 5.14519056261343e-06, + "loss": 34.4701, + "step": 4575 + }, + { + "epoch": 16.520090293453723, + "grad_norm": 239.569580078125, + "learning_rate": 5.1397459165154265e-06, + "loss": 35.8526, + "step": 4576 + }, + { + "epoch": 16.52370203160271, + "grad_norm": 362.4159240722656, + "learning_rate": 5.134301270417424e-06, + "loss": 36.2235, + "step": 4577 + }, + { + "epoch": 16.527313769751693, + "grad_norm": 321.2509765625, + "learning_rate": 5.128856624319419e-06, + "loss": 33.4705, + "step": 4578 + }, + { + "epoch": 16.530925507900676, + "grad_norm": 248.6092071533203, + "learning_rate": 5.123411978221415e-06, + "loss": 23.1329, + "step": 4579 + }, + { + "epoch": 16.534537246049663, + "grad_norm": 289.8996276855469, + "learning_rate": 5.117967332123412e-06, + "loss": 20.3184, + "step": 4580 + }, + { + "epoch": 16.534537246049663, + "eval_loss": 0.6034744381904602, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 4580 + }, + { + "epoch": 16.538148984198646, + "grad_norm": 215.02142333984375, + "learning_rate": 5.1125226860254085e-06, + "loss": 23.0513, + "step": 4581 + }, + { + "epoch": 16.54176072234763, + "grad_norm": 299.8429870605469, + "learning_rate": 5.107078039927405e-06, + "loss": 24.462, + "step": 4582 + }, + { + "epoch": 16.545372460496615, + "grad_norm": 267.0840759277344, + "learning_rate": 5.101633393829401e-06, + "loss": 39.9148, + "step": 4583 + }, + { + "epoch": 16.5489841986456, + "grad_norm": 227.23731994628906, + "learning_rate": 5.096188747731398e-06, + "loss": 40.6498, + "step": 4584 + }, + { + "epoch": 16.55259593679458, + "grad_norm": 313.9705810546875, + "learning_rate": 5.0907441016333935e-06, + "loss": 38.7711, + "step": 4585 + }, + { + "epoch": 16.55620767494357, + "grad_norm": 398.0429382324219, + "learning_rate": 5.0852994555353905e-06, + "loss": 39.6938, + "step": 4586 + }, + { + "epoch": 16.55981941309255, + "grad_norm": 365.489990234375, + "learning_rate": 5.079854809437387e-06, + "loss": 39.356, + "step": 4587 + }, + { + "epoch": 16.563431151241534, + "grad_norm": 365.05267333984375, + "learning_rate": 5.074410163339383e-06, + "loss": 40.2504, + "step": 4588 + }, + { + "epoch": 16.567042889390518, + "grad_norm": 288.0643310546875, + "learning_rate": 5.068965517241379e-06, + "loss": 39.6045, + "step": 4589 + }, + { + "epoch": 16.570654627539504, + "grad_norm": 262.0147705078125, + "learning_rate": 5.0635208711433755e-06, + "loss": 40.2504, + "step": 4590 + }, + { + "epoch": 16.570654627539504, + "eval_loss": 0.6028281450271606, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 4590 + }, + { + "epoch": 16.574266365688487, + "grad_norm": 325.78387451171875, + "learning_rate": 5.0580762250453725e-06, + "loss": 40.3154, + "step": 4591 + }, + { + "epoch": 16.57787810383747, + "grad_norm": 221.56591796875, + "learning_rate": 5.052631578947369e-06, + "loss": 39.5046, + "step": 4592 + }, + { + "epoch": 16.581489841986457, + "grad_norm": 227.02520751953125, + "learning_rate": 5.047186932849365e-06, + "loss": 38.3611, + "step": 4593 + }, + { + "epoch": 16.58510158013544, + "grad_norm": 232.46922302246094, + "learning_rate": 5.041742286751361e-06, + "loss": 36.5043, + "step": 4594 + }, + { + "epoch": 16.588713318284423, + "grad_norm": 230.59536743164062, + "learning_rate": 5.0362976406533575e-06, + "loss": 36.2179, + "step": 4595 + }, + { + "epoch": 16.59232505643341, + "grad_norm": 439.9609069824219, + "learning_rate": 5.0308529945553545e-06, + "loss": 36.4797, + "step": 4596 + }, + { + "epoch": 16.595936794582393, + "grad_norm": 322.4086608886719, + "learning_rate": 5.02540834845735e-06, + "loss": 37.4151, + "step": 4597 + }, + { + "epoch": 16.599548532731376, + "grad_norm": 318.1732482910156, + "learning_rate": 5.019963702359347e-06, + "loss": 37.2815, + "step": 4598 + }, + { + "epoch": 16.603160270880363, + "grad_norm": 321.34039306640625, + "learning_rate": 5.014519056261343e-06, + "loss": 36.8388, + "step": 4599 + }, + { + "epoch": 16.606772009029346, + "grad_norm": 341.28790283203125, + "learning_rate": 5.0090744101633395e-06, + "loss": 37.9805, + "step": 4600 + }, + { + "epoch": 16.606772009029346, + "eval_loss": 0.6045316457748413, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 4600 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2563771479241523e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3029ab34d5e8a28cccda82ac2cc8146ca3f302de --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce793adc3cc5d22984a150d10277982fa17260cec0b3089edaef3f66dfbf19ac +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..817cd1191cbfc4057075f924ae42ac19422b5434 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3029d8683ff6ef3cf1d3704afb09e72b4e43a27901826a0b3dc3f0b27122ab78 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b9f5a591755f82d947f446b58f67e1cb74187c5 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3515085f7c66ae7b5a70abd1e4951920b20e5cd65268121595a64a64f0a38733 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..62ad5139fabcced74ce336df20bc8bb5f084a25f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa8dedfe92ec443a77f975a718da54dc0f3a55fb6a9d161fc730ac1126ab99d +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9f348889bfb5364da5d6afcd488731e6b1d7c90 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a995b0a2d70e9b0b0733066ea1f21ff3eb816eb8a743ab7c491add0ee9e42a +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7b82a315f3907c5b4365247a0bd0e6b55ca4e2ca --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/trainer_state.json @@ -0,0 +1,37473 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 17.328668171557563, + "eval_steps": 10, + "global_step": 4800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + }, + { + "epoch": 15.166139954853273, + "grad_norm": 227.31346130371094, + "learning_rate": 7.181488203266788e-06, + "loss": 28.0234, + "step": 4201 + }, + { + "epoch": 15.169751693002258, + "grad_norm": 208.75048828125, + "learning_rate": 7.176043557168784e-06, + "loss": 22.5147, + "step": 4202 + }, + { + "epoch": 15.173363431151241, + "grad_norm": 222.91090393066406, + "learning_rate": 7.17059891107078e-06, + "loss": 22.1029, + "step": 4203 + }, + { + "epoch": 15.176975169300226, + "grad_norm": 219.40621948242188, + "learning_rate": 7.165154264972777e-06, + "loss": 22.9827, + "step": 4204 + }, + { + "epoch": 15.18058690744921, + "grad_norm": 229.11813354492188, + "learning_rate": 7.1597096188747735e-06, + "loss": 23.6974, + "step": 4205 + }, + { + "epoch": 15.184198645598194, + "grad_norm": 256.7950744628906, + "learning_rate": 7.15426497277677e-06, + "loss": 39.6585, + "step": 4206 + }, + { + "epoch": 15.187810383747179, + "grad_norm": 237.47613525390625, + "learning_rate": 7.148820326678766e-06, + "loss": 40.0478, + "step": 4207 + }, + { + "epoch": 15.191422121896162, + "grad_norm": 259.54296875, + "learning_rate": 7.143375680580762e-06, + "loss": 39.7604, + "step": 4208 + }, + { + "epoch": 15.195033860045147, + "grad_norm": 249.7389678955078, + "learning_rate": 7.137931034482759e-06, + "loss": 39.0201, + "step": 4209 + }, + { + "epoch": 15.198645598194132, + "grad_norm": 298.4624938964844, + "learning_rate": 7.132486388384755e-06, + "loss": 39.8575, + "step": 4210 + }, + { + "epoch": 15.198645598194132, + "eval_loss": 0.6088115572929382, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 4210 + }, + { + "epoch": 15.202257336343115, + "grad_norm": 267.57659912109375, + "learning_rate": 7.127041742286752e-06, + "loss": 38.8929, + "step": 4211 + }, + { + "epoch": 15.2058690744921, + "grad_norm": 243.88333129882812, + "learning_rate": 7.121597096188748e-06, + "loss": 39.6078, + "step": 4212 + }, + { + "epoch": 15.209480812641084, + "grad_norm": 268.2644348144531, + "learning_rate": 7.116152450090745e-06, + "loss": 39.9488, + "step": 4213 + }, + { + "epoch": 15.213092550790067, + "grad_norm": 240.2657928466797, + "learning_rate": 7.11070780399274e-06, + "loss": 40.1645, + "step": 4214 + }, + { + "epoch": 15.216704288939052, + "grad_norm": 198.76910400390625, + "learning_rate": 7.105263157894737e-06, + "loss": 38.2229, + "step": 4215 + }, + { + "epoch": 15.220316027088035, + "grad_norm": 234.11170959472656, + "learning_rate": 7.099818511796734e-06, + "loss": 39.5294, + "step": 4216 + }, + { + "epoch": 15.22392776523702, + "grad_norm": 192.80194091796875, + "learning_rate": 7.094373865698729e-06, + "loss": 36.9752, + "step": 4217 + }, + { + "epoch": 15.227539503386005, + "grad_norm": 241.8236846923828, + "learning_rate": 7.088929219600726e-06, + "loss": 36.1043, + "step": 4218 + }, + { + "epoch": 15.231151241534988, + "grad_norm": 451.6199645996094, + "learning_rate": 7.083484573502722e-06, + "loss": 37.7911, + "step": 4219 + }, + { + "epoch": 15.234762979683973, + "grad_norm": 351.9429626464844, + "learning_rate": 7.0780399274047195e-06, + "loss": 35.5202, + "step": 4220 + }, + { + "epoch": 15.234762979683973, + "eval_loss": 0.6093130111694336, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 4220 + }, + { + "epoch": 15.238374717832958, + "grad_norm": 266.4995422363281, + "learning_rate": 7.072595281306715e-06, + "loss": 37.5552, + "step": 4221 + }, + { + "epoch": 15.241986455981941, + "grad_norm": 258.74578857421875, + "learning_rate": 7.067150635208712e-06, + "loss": 37.1315, + "step": 4222 + }, + { + "epoch": 15.245598194130926, + "grad_norm": 233.30921936035156, + "learning_rate": 7.061705989110708e-06, + "loss": 36.9237, + "step": 4223 + }, + { + "epoch": 15.249209932279909, + "grad_norm": 235.8688201904297, + "learning_rate": 7.056261343012704e-06, + "loss": 38.0112, + "step": 4224 + }, + { + "epoch": 15.252821670428894, + "grad_norm": 214.88436889648438, + "learning_rate": 7.050816696914701e-06, + "loss": 38.5641, + "step": 4225 + }, + { + "epoch": 15.256433408577879, + "grad_norm": 252.64144897460938, + "learning_rate": 7.045372050816697e-06, + "loss": 36.7125, + "step": 4226 + }, + { + "epoch": 15.260045146726862, + "grad_norm": 293.78424072265625, + "learning_rate": 7.039927404718694e-06, + "loss": 37.5956, + "step": 4227 + }, + { + "epoch": 15.263656884875846, + "grad_norm": 234.13510131835938, + "learning_rate": 7.03448275862069e-06, + "loss": 38.1829, + "step": 4228 + }, + { + "epoch": 15.267268623024831, + "grad_norm": 279.534912109375, + "learning_rate": 7.029038112522686e-06, + "loss": 39.0785, + "step": 4229 + }, + { + "epoch": 15.270880361173814, + "grad_norm": 246.4442596435547, + "learning_rate": 7.023593466424683e-06, + "loss": 39.1753, + "step": 4230 + }, + { + "epoch": 15.270880361173814, + "eval_loss": 0.6043311357498169, + "eval_runtime": 3.1452, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 4230 + }, + { + "epoch": 15.2744920993228, + "grad_norm": 233.87466430664062, + "learning_rate": 7.018148820326679e-06, + "loss": 39.8464, + "step": 4231 + }, + { + "epoch": 15.278103837471784, + "grad_norm": 228.54898071289062, + "learning_rate": 7.012704174228675e-06, + "loss": 37.9721, + "step": 4232 + }, + { + "epoch": 15.281715575620767, + "grad_norm": 273.70050048828125, + "learning_rate": 7.007259528130671e-06, + "loss": 38.9153, + "step": 4233 + }, + { + "epoch": 15.285327313769752, + "grad_norm": 269.8402404785156, + "learning_rate": 7.001814882032668e-06, + "loss": 36.7607, + "step": 4234 + }, + { + "epoch": 15.288939051918735, + "grad_norm": 260.13629150390625, + "learning_rate": 6.996370235934665e-06, + "loss": 35.3684, + "step": 4235 + }, + { + "epoch": 15.29255079006772, + "grad_norm": 223.9878692626953, + "learning_rate": 6.990925589836661e-06, + "loss": 32.8784, + "step": 4236 + }, + { + "epoch": 15.296162528216705, + "grad_norm": 225.69212341308594, + "learning_rate": 6.985480943738657e-06, + "loss": 31.3751, + "step": 4237 + }, + { + "epoch": 15.299774266365688, + "grad_norm": 215.99801635742188, + "learning_rate": 6.980036297640653e-06, + "loss": 31.5331, + "step": 4238 + }, + { + "epoch": 15.303386004514673, + "grad_norm": 263.26568603515625, + "learning_rate": 6.97459165154265e-06, + "loss": 32.5806, + "step": 4239 + }, + { + "epoch": 15.306997742663658, + "grad_norm": 203.2392578125, + "learning_rate": 6.969147005444646e-06, + "loss": 31.6379, + "step": 4240 + }, + { + "epoch": 15.306997742663658, + "eval_loss": 0.6046441793441772, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 4240 + }, + { + "epoch": 15.31060948081264, + "grad_norm": 221.2167510986328, + "learning_rate": 6.963702359346643e-06, + "loss": 33.7034, + "step": 4241 + }, + { + "epoch": 15.314221218961626, + "grad_norm": 212.58737182617188, + "learning_rate": 6.958257713248639e-06, + "loss": 32.5511, + "step": 4242 + }, + { + "epoch": 15.317832957110609, + "grad_norm": 270.7123718261719, + "learning_rate": 6.952813067150635e-06, + "loss": 33.2513, + "step": 4243 + }, + { + "epoch": 15.321444695259594, + "grad_norm": 270.2066345214844, + "learning_rate": 6.9473684210526315e-06, + "loss": 33.9559, + "step": 4244 + }, + { + "epoch": 15.325056433408578, + "grad_norm": 232.8043212890625, + "learning_rate": 6.941923774954628e-06, + "loss": 33.9916, + "step": 4245 + }, + { + "epoch": 15.328668171557561, + "grad_norm": 325.419921875, + "learning_rate": 6.936479128856625e-06, + "loss": 35.2098, + "step": 4246 + }, + { + "epoch": 15.332279909706546, + "grad_norm": 303.326416015625, + "learning_rate": 6.93103448275862e-06, + "loss": 35.0784, + "step": 4247 + }, + { + "epoch": 15.335891647855531, + "grad_norm": 327.05963134765625, + "learning_rate": 6.925589836660617e-06, + "loss": 35.9915, + "step": 4248 + }, + { + "epoch": 15.339503386004514, + "grad_norm": 326.58795166015625, + "learning_rate": 6.9201451905626135e-06, + "loss": 35.1914, + "step": 4249 + }, + { + "epoch": 15.343115124153499, + "grad_norm": 406.38812255859375, + "learning_rate": 6.914700544464611e-06, + "loss": 37.1535, + "step": 4250 + }, + { + "epoch": 15.343115124153499, + "eval_loss": 0.6056071519851685, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 4250 + }, + { + "epoch": 15.346726862302482, + "grad_norm": 325.6965637207031, + "learning_rate": 6.909255898366606e-06, + "loss": 29.8698, + "step": 4251 + }, + { + "epoch": 15.350338600451467, + "grad_norm": 212.59727478027344, + "learning_rate": 6.903811252268603e-06, + "loss": 22.2995, + "step": 4252 + }, + { + "epoch": 15.353950338600452, + "grad_norm": 257.447509765625, + "learning_rate": 6.898366606170599e-06, + "loss": 23.1014, + "step": 4253 + }, + { + "epoch": 15.357562076749435, + "grad_norm": 266.139892578125, + "learning_rate": 6.8929219600725955e-06, + "loss": 23.2319, + "step": 4254 + }, + { + "epoch": 15.36117381489842, + "grad_norm": 332.7207336425781, + "learning_rate": 6.887477313974592e-06, + "loss": 23.7218, + "step": 4255 + }, + { + "epoch": 15.364785553047405, + "grad_norm": 272.7341003417969, + "learning_rate": 6.882032667876588e-06, + "loss": 39.5787, + "step": 4256 + }, + { + "epoch": 15.368397291196388, + "grad_norm": 259.00872802734375, + "learning_rate": 6.876588021778585e-06, + "loss": 41.0874, + "step": 4257 + }, + { + "epoch": 15.372009029345373, + "grad_norm": 236.87033081054688, + "learning_rate": 6.8711433756805804e-06, + "loss": 38.9811, + "step": 4258 + }, + { + "epoch": 15.375620767494357, + "grad_norm": 293.6808776855469, + "learning_rate": 6.8656987295825775e-06, + "loss": 39.481, + "step": 4259 + }, + { + "epoch": 15.37923250564334, + "grad_norm": 266.0845947265625, + "learning_rate": 6.860254083484574e-06, + "loss": 39.4595, + "step": 4260 + }, + { + "epoch": 15.37923250564334, + "eval_loss": 0.6039742231369019, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 4260 + }, + { + "epoch": 15.382844243792325, + "grad_norm": 398.0877685546875, + "learning_rate": 6.85480943738657e-06, + "loss": 38.8899, + "step": 4261 + }, + { + "epoch": 15.386455981941308, + "grad_norm": 208.37376403808594, + "learning_rate": 6.849364791288566e-06, + "loss": 39.2194, + "step": 4262 + }, + { + "epoch": 15.390067720090293, + "grad_norm": 214.6958770751953, + "learning_rate": 6.8439201451905624e-06, + "loss": 38.9911, + "step": 4263 + }, + { + "epoch": 15.393679458239278, + "grad_norm": 210.2147674560547, + "learning_rate": 6.8384754990925595e-06, + "loss": 40.5973, + "step": 4264 + }, + { + "epoch": 15.397291196388261, + "grad_norm": 240.47030639648438, + "learning_rate": 6.833030852994556e-06, + "loss": 39.3936, + "step": 4265 + }, + { + "epoch": 15.400902934537246, + "grad_norm": 273.86883544921875, + "learning_rate": 6.827586206896552e-06, + "loss": 40.0848, + "step": 4266 + }, + { + "epoch": 15.404514672686231, + "grad_norm": 239.36453247070312, + "learning_rate": 6.822141560798548e-06, + "loss": 36.5967, + "step": 4267 + }, + { + "epoch": 15.408126410835214, + "grad_norm": 215.3413543701172, + "learning_rate": 6.8166969147005444e-06, + "loss": 37.8173, + "step": 4268 + }, + { + "epoch": 15.411738148984199, + "grad_norm": 260.1557312011719, + "learning_rate": 6.811252268602541e-06, + "loss": 37.7175, + "step": 4269 + }, + { + "epoch": 15.415349887133182, + "grad_norm": 239.4988555908203, + "learning_rate": 6.805807622504537e-06, + "loss": 37.0618, + "step": 4270 + }, + { + "epoch": 15.415349887133182, + "eval_loss": 0.6049810647964478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4270 + }, + { + "epoch": 15.418961625282167, + "grad_norm": 223.06094360351562, + "learning_rate": 6.800362976406534e-06, + "loss": 37.0687, + "step": 4271 + }, + { + "epoch": 15.422573363431152, + "grad_norm": 261.7460632324219, + "learning_rate": 6.79491833030853e-06, + "loss": 35.9437, + "step": 4272 + }, + { + "epoch": 15.426185101580135, + "grad_norm": 230.92135620117188, + "learning_rate": 6.7894736842105264e-06, + "loss": 38.3316, + "step": 4273 + }, + { + "epoch": 15.42979683972912, + "grad_norm": 370.6309509277344, + "learning_rate": 6.784029038112523e-06, + "loss": 38.2666, + "step": 4274 + }, + { + "epoch": 15.433408577878104, + "grad_norm": 249.7823944091797, + "learning_rate": 6.77858439201452e-06, + "loss": 38.1159, + "step": 4275 + }, + { + "epoch": 15.437020316027088, + "grad_norm": 404.1676330566406, + "learning_rate": 6.773139745916516e-06, + "loss": 37.6548, + "step": 4276 + }, + { + "epoch": 15.440632054176072, + "grad_norm": 256.3241271972656, + "learning_rate": 6.767695099818511e-06, + "loss": 38.3713, + "step": 4277 + }, + { + "epoch": 15.444243792325057, + "grad_norm": 240.55934143066406, + "learning_rate": 6.7622504537205084e-06, + "loss": 39.2487, + "step": 4278 + }, + { + "epoch": 15.44785553047404, + "grad_norm": 230.010009765625, + "learning_rate": 6.756805807622505e-06, + "loss": 39.4391, + "step": 4279 + }, + { + "epoch": 15.451467268623025, + "grad_norm": 226.51385498046875, + "learning_rate": 6.751361161524502e-06, + "loss": 38.6273, + "step": 4280 + }, + { + "epoch": 15.451467268623025, + "eval_loss": 0.6027400493621826, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 4280 + }, + { + "epoch": 15.455079006772008, + "grad_norm": 314.57476806640625, + "learning_rate": 6.745916515426497e-06, + "loss": 38.583, + "step": 4281 + }, + { + "epoch": 15.458690744920993, + "grad_norm": 229.91238403320312, + "learning_rate": 6.740471869328494e-06, + "loss": 39.2433, + "step": 4282 + }, + { + "epoch": 15.462302483069978, + "grad_norm": 284.7301330566406, + "learning_rate": 6.7350272232304904e-06, + "loss": 38.8577, + "step": 4283 + }, + { + "epoch": 15.465914221218961, + "grad_norm": 209.32266235351562, + "learning_rate": 6.729582577132486e-06, + "loss": 34.928, + "step": 4284 + }, + { + "epoch": 15.469525959367946, + "grad_norm": 264.6195068359375, + "learning_rate": 6.724137931034483e-06, + "loss": 32.0527, + "step": 4285 + }, + { + "epoch": 15.47313769751693, + "grad_norm": 224.2421112060547, + "learning_rate": 6.718693284936479e-06, + "loss": 31.939, + "step": 4286 + }, + { + "epoch": 15.476749435665914, + "grad_norm": 233.0791015625, + "learning_rate": 6.713248638838476e-06, + "loss": 32.5402, + "step": 4287 + }, + { + "epoch": 15.480361173814899, + "grad_norm": 284.129638671875, + "learning_rate": 6.707803992740472e-06, + "loss": 31.0069, + "step": 4288 + }, + { + "epoch": 15.483972911963882, + "grad_norm": 253.6517791748047, + "learning_rate": 6.702359346642469e-06, + "loss": 32.0172, + "step": 4289 + }, + { + "epoch": 15.487584650112867, + "grad_norm": 305.63775634765625, + "learning_rate": 6.696914700544465e-06, + "loss": 34.1643, + "step": 4290 + }, + { + "epoch": 15.487584650112867, + "eval_loss": 0.6044390201568604, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 4290 + }, + { + "epoch": 15.491196388261852, + "grad_norm": 224.6516876220703, + "learning_rate": 6.691470054446461e-06, + "loss": 32.4735, + "step": 4291 + }, + { + "epoch": 15.494808126410835, + "grad_norm": 257.5385437011719, + "learning_rate": 6.686025408348457e-06, + "loss": 33.9272, + "step": 4292 + }, + { + "epoch": 15.49841986455982, + "grad_norm": 393.9106140136719, + "learning_rate": 6.680580762250454e-06, + "loss": 34.4176, + "step": 4293 + }, + { + "epoch": 15.502031602708804, + "grad_norm": 333.5639953613281, + "learning_rate": 6.675136116152451e-06, + "loss": 34.5695, + "step": 4294 + }, + { + "epoch": 15.505643340857787, + "grad_norm": 319.8660888671875, + "learning_rate": 6.669691470054446e-06, + "loss": 34.5337, + "step": 4295 + }, + { + "epoch": 15.509255079006772, + "grad_norm": 246.78086853027344, + "learning_rate": 6.664246823956443e-06, + "loss": 34.8297, + "step": 4296 + }, + { + "epoch": 15.512866817155757, + "grad_norm": 313.4530944824219, + "learning_rate": 6.658802177858439e-06, + "loss": 34.6901, + "step": 4297 + }, + { + "epoch": 15.51647855530474, + "grad_norm": 257.2852783203125, + "learning_rate": 6.6533575317604364e-06, + "loss": 35.3892, + "step": 4298 + }, + { + "epoch": 15.520090293453725, + "grad_norm": 336.5549011230469, + "learning_rate": 6.647912885662432e-06, + "loss": 36.3347, + "step": 4299 + }, + { + "epoch": 15.523702031602708, + "grad_norm": 275.726806640625, + "learning_rate": 6.642468239564428e-06, + "loss": 36.3559, + "step": 4300 + }, + { + "epoch": 15.523702031602708, + "eval_loss": 0.6056334376335144, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 4300 + }, + { + "epoch": 15.527313769751693, + "grad_norm": 275.5987243652344, + "learning_rate": 6.637023593466425e-06, + "loss": 28.5887, + "step": 4301 + }, + { + "epoch": 15.530925507900678, + "grad_norm": 242.59762573242188, + "learning_rate": 6.631578947368421e-06, + "loss": 22.1398, + "step": 4302 + }, + { + "epoch": 15.534537246049661, + "grad_norm": 228.04344177246094, + "learning_rate": 6.626134301270418e-06, + "loss": 21.4593, + "step": 4303 + }, + { + "epoch": 15.538148984198646, + "grad_norm": 204.2377166748047, + "learning_rate": 6.620689655172414e-06, + "loss": 22.5132, + "step": 4304 + }, + { + "epoch": 15.54176072234763, + "grad_norm": 243.0237579345703, + "learning_rate": 6.615245009074411e-06, + "loss": 24.2777, + "step": 4305 + }, + { + "epoch": 15.545372460496614, + "grad_norm": 227.2841339111328, + "learning_rate": 6.609800362976407e-06, + "loss": 39.7235, + "step": 4306 + }, + { + "epoch": 15.548984198645599, + "grad_norm": 253.8453826904297, + "learning_rate": 6.6043557168784025e-06, + "loss": 39.9317, + "step": 4307 + }, + { + "epoch": 15.552595936794582, + "grad_norm": 243.62757873535156, + "learning_rate": 6.5989110707804e-06, + "loss": 38.9825, + "step": 4308 + }, + { + "epoch": 15.556207674943566, + "grad_norm": 262.4398498535156, + "learning_rate": 6.593466424682396e-06, + "loss": 39.7456, + "step": 4309 + }, + { + "epoch": 15.559819413092551, + "grad_norm": 268.5821228027344, + "learning_rate": 6.588021778584392e-06, + "loss": 39.5152, + "step": 4310 + }, + { + "epoch": 15.559819413092551, + "eval_loss": 0.6060237288475037, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4310 + }, + { + "epoch": 15.563431151241534, + "grad_norm": 297.6933898925781, + "learning_rate": 6.582577132486388e-06, + "loss": 40.1259, + "step": 4311 + }, + { + "epoch": 15.56704288939052, + "grad_norm": 234.08816528320312, + "learning_rate": 6.577132486388385e-06, + "loss": 40.8591, + "step": 4312 + }, + { + "epoch": 15.570654627539504, + "grad_norm": 292.2416687011719, + "learning_rate": 6.571687840290382e-06, + "loss": 39.2377, + "step": 4313 + }, + { + "epoch": 15.574266365688487, + "grad_norm": 205.25888061523438, + "learning_rate": 6.566243194192377e-06, + "loss": 39.92, + "step": 4314 + }, + { + "epoch": 15.577878103837472, + "grad_norm": 229.06695556640625, + "learning_rate": 6.560798548094374e-06, + "loss": 39.8886, + "step": 4315 + }, + { + "epoch": 15.581489841986457, + "grad_norm": 223.3977508544922, + "learning_rate": 6.55535390199637e-06, + "loss": 38.5423, + "step": 4316 + }, + { + "epoch": 15.58510158013544, + "grad_norm": 254.60203552246094, + "learning_rate": 6.549909255898367e-06, + "loss": 36.8055, + "step": 4317 + }, + { + "epoch": 15.588713318284425, + "grad_norm": 304.463623046875, + "learning_rate": 6.544464609800363e-06, + "loss": 37.6164, + "step": 4318 + }, + { + "epoch": 15.592325056433408, + "grad_norm": 279.955810546875, + "learning_rate": 6.53901996370236e-06, + "loss": 37.4778, + "step": 4319 + }, + { + "epoch": 15.595936794582393, + "grad_norm": 230.11105346679688, + "learning_rate": 6.533575317604356e-06, + "loss": 36.9663, + "step": 4320 + }, + { + "epoch": 15.595936794582393, + "eval_loss": 0.6048213243484497, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.966, + "eval_steps_per_second": 56.966, + "step": 4320 + }, + { + "epoch": 15.599548532731378, + "grad_norm": 261.98187255859375, + "learning_rate": 6.528130671506351e-06, + "loss": 37.7402, + "step": 4321 + }, + { + "epoch": 15.60316027088036, + "grad_norm": 247.34771728515625, + "learning_rate": 6.5226860254083485e-06, + "loss": 37.1402, + "step": 4322 + }, + { + "epoch": 15.606772009029346, + "grad_norm": 277.1517333984375, + "learning_rate": 6.517241379310345e-06, + "loss": 38.3976, + "step": 4323 + }, + { + "epoch": 15.610383747178329, + "grad_norm": 231.89683532714844, + "learning_rate": 6.511796733212342e-06, + "loss": 38.0834, + "step": 4324 + }, + { + "epoch": 15.613995485327314, + "grad_norm": 323.8349304199219, + "learning_rate": 6.506352087114337e-06, + "loss": 37.9085, + "step": 4325 + }, + { + "epoch": 15.617607223476298, + "grad_norm": 263.5240783691406, + "learning_rate": 6.500907441016334e-06, + "loss": 37.0702, + "step": 4326 + }, + { + "epoch": 15.621218961625281, + "grad_norm": 217.0517578125, + "learning_rate": 6.4954627949183305e-06, + "loss": 36.9406, + "step": 4327 + }, + { + "epoch": 15.624830699774266, + "grad_norm": 267.4161682128906, + "learning_rate": 6.4900181488203276e-06, + "loss": 38.8773, + "step": 4328 + }, + { + "epoch": 15.628442437923251, + "grad_norm": 232.36000061035156, + "learning_rate": 6.484573502722323e-06, + "loss": 38.4978, + "step": 4329 + }, + { + "epoch": 15.632054176072234, + "grad_norm": 241.61373901367188, + "learning_rate": 6.479128856624319e-06, + "loss": 38.4895, + "step": 4330 + }, + { + "epoch": 15.632054176072234, + "eval_loss": 0.6024956703186035, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4330 + }, + { + "epoch": 15.635665914221219, + "grad_norm": 232.27928161621094, + "learning_rate": 6.473684210526316e-06, + "loss": 38.8551, + "step": 4331 + }, + { + "epoch": 15.639277652370204, + "grad_norm": 243.42828369140625, + "learning_rate": 6.4682395644283125e-06, + "loss": 38.6475, + "step": 4332 + }, + { + "epoch": 15.642889390519187, + "grad_norm": 306.2618103027344, + "learning_rate": 6.462794918330309e-06, + "loss": 37.2015, + "step": 4333 + }, + { + "epoch": 15.646501128668172, + "grad_norm": 335.795166015625, + "learning_rate": 6.457350272232305e-06, + "loss": 36.5255, + "step": 4334 + }, + { + "epoch": 15.650112866817155, + "grad_norm": 209.6246337890625, + "learning_rate": 6.451905626134302e-06, + "loss": 32.4219, + "step": 4335 + }, + { + "epoch": 15.65372460496614, + "grad_norm": 283.2094421386719, + "learning_rate": 6.446460980036297e-06, + "loss": 30.9137, + "step": 4336 + }, + { + "epoch": 15.657336343115125, + "grad_norm": 255.4412841796875, + "learning_rate": 6.441016333938294e-06, + "loss": 30.8939, + "step": 4337 + }, + { + "epoch": 15.660948081264108, + "grad_norm": 217.8052215576172, + "learning_rate": 6.435571687840291e-06, + "loss": 31.5974, + "step": 4338 + }, + { + "epoch": 15.664559819413093, + "grad_norm": 215.64398193359375, + "learning_rate": 6.430127041742287e-06, + "loss": 30.0276, + "step": 4339 + }, + { + "epoch": 15.668171557562077, + "grad_norm": 244.32704162597656, + "learning_rate": 6.424682395644283e-06, + "loss": 32.5249, + "step": 4340 + }, + { + "epoch": 15.668171557562077, + "eval_loss": 0.6037233471870422, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4340 + }, + { + "epoch": 15.67178329571106, + "grad_norm": 270.9132080078125, + "learning_rate": 6.419237749546279e-06, + "loss": 32.9923, + "step": 4341 + }, + { + "epoch": 15.675395033860045, + "grad_norm": 230.20314025878906, + "learning_rate": 6.4137931034482765e-06, + "loss": 32.871, + "step": 4342 + }, + { + "epoch": 15.679006772009028, + "grad_norm": 372.4366149902344, + "learning_rate": 6.408348457350273e-06, + "loss": 35.2687, + "step": 4343 + }, + { + "epoch": 15.682618510158013, + "grad_norm": 325.0901794433594, + "learning_rate": 6.402903811252268e-06, + "loss": 34.3107, + "step": 4344 + }, + { + "epoch": 15.686230248306998, + "grad_norm": 277.8683166503906, + "learning_rate": 6.397459165154265e-06, + "loss": 34.291, + "step": 4345 + }, + { + "epoch": 15.689841986455981, + "grad_norm": 262.566162109375, + "learning_rate": 6.392014519056261e-06, + "loss": 33.2989, + "step": 4346 + }, + { + "epoch": 15.693453724604966, + "grad_norm": 293.56536865234375, + "learning_rate": 6.386569872958258e-06, + "loss": 35.6865, + "step": 4347 + }, + { + "epoch": 15.697065462753951, + "grad_norm": 291.1886291503906, + "learning_rate": 6.381125226860254e-06, + "loss": 35.6959, + "step": 4348 + }, + { + "epoch": 15.700677200902934, + "grad_norm": 265.2365417480469, + "learning_rate": 6.375680580762251e-06, + "loss": 36.479, + "step": 4349 + }, + { + "epoch": 15.704288939051919, + "grad_norm": 342.8822021484375, + "learning_rate": 6.370235934664247e-06, + "loss": 35.9198, + "step": 4350 + }, + { + "epoch": 15.704288939051919, + "eval_loss": 0.603361189365387, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4350 + }, + { + "epoch": 15.707900677200904, + "grad_norm": 276.1657409667969, + "learning_rate": 6.364791288566243e-06, + "loss": 29.429, + "step": 4351 + }, + { + "epoch": 15.711512415349887, + "grad_norm": 267.2456359863281, + "learning_rate": 6.35934664246824e-06, + "loss": 23.0038, + "step": 4352 + }, + { + "epoch": 15.715124153498872, + "grad_norm": 255.4893798828125, + "learning_rate": 6.353901996370236e-06, + "loss": 21.1185, + "step": 4353 + }, + { + "epoch": 15.718735891647855, + "grad_norm": 252.10501098632812, + "learning_rate": 6.348457350272233e-06, + "loss": 23.1769, + "step": 4354 + }, + { + "epoch": 15.72234762979684, + "grad_norm": 239.63905334472656, + "learning_rate": 6.343012704174228e-06, + "loss": 24.5905, + "step": 4355 + }, + { + "epoch": 15.725959367945824, + "grad_norm": 228.00950622558594, + "learning_rate": 6.337568058076225e-06, + "loss": 39.6657, + "step": 4356 + }, + { + "epoch": 15.729571106094808, + "grad_norm": 234.10647583007812, + "learning_rate": 6.332123411978222e-06, + "loss": 41.145, + "step": 4357 + }, + { + "epoch": 15.733182844243792, + "grad_norm": 236.55223083496094, + "learning_rate": 6.326678765880219e-06, + "loss": 40.2784, + "step": 4358 + }, + { + "epoch": 15.736794582392777, + "grad_norm": 340.1712646484375, + "learning_rate": 6.321234119782214e-06, + "loss": 39.3598, + "step": 4359 + }, + { + "epoch": 15.74040632054176, + "grad_norm": 269.4134826660156, + "learning_rate": 6.31578947368421e-06, + "loss": 38.7777, + "step": 4360 + }, + { + "epoch": 15.74040632054176, + "eval_loss": 0.6048015356063843, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4360 + }, + { + "epoch": 15.744018058690745, + "grad_norm": 316.5471496582031, + "learning_rate": 6.310344827586207e-06, + "loss": 39.6707, + "step": 4361 + }, + { + "epoch": 15.747629796839728, + "grad_norm": 231.31820678710938, + "learning_rate": 6.304900181488203e-06, + "loss": 38.0009, + "step": 4362 + }, + { + "epoch": 15.751241534988713, + "grad_norm": 207.19117736816406, + "learning_rate": 6.2994555353902e-06, + "loss": 41.6523, + "step": 4363 + }, + { + "epoch": 15.754853273137698, + "grad_norm": 239.8341064453125, + "learning_rate": 6.294010889292196e-06, + "loss": 40.3203, + "step": 4364 + }, + { + "epoch": 15.758465011286681, + "grad_norm": 277.2004089355469, + "learning_rate": 6.288566243194193e-06, + "loss": 39.8026, + "step": 4365 + }, + { + "epoch": 15.762076749435666, + "grad_norm": 227.74728393554688, + "learning_rate": 6.2831215970961886e-06, + "loss": 38.1561, + "step": 4366 + }, + { + "epoch": 15.76568848758465, + "grad_norm": 268.6826477050781, + "learning_rate": 6.277676950998185e-06, + "loss": 37.4653, + "step": 4367 + }, + { + "epoch": 15.769300225733634, + "grad_norm": 308.92950439453125, + "learning_rate": 6.272232304900182e-06, + "loss": 36.3506, + "step": 4368 + }, + { + "epoch": 15.772911963882619, + "grad_norm": 216.53627014160156, + "learning_rate": 6.266787658802178e-06, + "loss": 36.12, + "step": 4369 + }, + { + "epoch": 15.776523702031604, + "grad_norm": 264.0691833496094, + "learning_rate": 6.261343012704174e-06, + "loss": 37.5023, + "step": 4370 + }, + { + "epoch": 15.776523702031604, + "eval_loss": 0.608928382396698, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.989, + "eval_steps_per_second": 56.989, + "step": 4370 + }, + { + "epoch": 15.780135440180587, + "grad_norm": 474.7265319824219, + "learning_rate": 6.2558983666061706e-06, + "loss": 38.8381, + "step": 4371 + }, + { + "epoch": 15.783747178329572, + "grad_norm": 303.66229248046875, + "learning_rate": 6.250453720508168e-06, + "loss": 36.5951, + "step": 4372 + }, + { + "epoch": 15.787358916478555, + "grad_norm": 231.65744018554688, + "learning_rate": 6.245009074410164e-06, + "loss": 36.4717, + "step": 4373 + }, + { + "epoch": 15.79097065462754, + "grad_norm": 235.25833129882812, + "learning_rate": 6.239564428312159e-06, + "loss": 38.4578, + "step": 4374 + }, + { + "epoch": 15.794582392776524, + "grad_norm": 215.5384063720703, + "learning_rate": 6.234119782214156e-06, + "loss": 38.0475, + "step": 4375 + }, + { + "epoch": 15.798194130925507, + "grad_norm": 216.3609619140625, + "learning_rate": 6.2286751361161526e-06, + "loss": 37.1825, + "step": 4376 + }, + { + "epoch": 15.801805869074492, + "grad_norm": 275.54522705078125, + "learning_rate": 6.223230490018149e-06, + "loss": 38.5608, + "step": 4377 + }, + { + "epoch": 15.805417607223477, + "grad_norm": 226.7752685546875, + "learning_rate": 6.217785843920145e-06, + "loss": 38.0612, + "step": 4378 + }, + { + "epoch": 15.80902934537246, + "grad_norm": 262.14501953125, + "learning_rate": 6.212341197822142e-06, + "loss": 38.0049, + "step": 4379 + }, + { + "epoch": 15.812641083521445, + "grad_norm": 299.82196044921875, + "learning_rate": 6.206896551724138e-06, + "loss": 39.1441, + "step": 4380 + }, + { + "epoch": 15.812641083521445, + "eval_loss": 0.6033969521522522, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4380 + }, + { + "epoch": 15.816252821670428, + "grad_norm": 295.24188232421875, + "learning_rate": 6.2014519056261346e-06, + "loss": 39.266, + "step": 4381 + }, + { + "epoch": 15.819864559819413, + "grad_norm": 298.1729736328125, + "learning_rate": 6.196007259528131e-06, + "loss": 39.4025, + "step": 4382 + }, + { + "epoch": 15.823476297968398, + "grad_norm": 234.97958374023438, + "learning_rate": 6.190562613430127e-06, + "loss": 39.4752, + "step": 4383 + }, + { + "epoch": 15.827088036117381, + "grad_norm": 270.3009338378906, + "learning_rate": 6.185117967332124e-06, + "loss": 36.0322, + "step": 4384 + }, + { + "epoch": 15.830699774266366, + "grad_norm": 279.78314208984375, + "learning_rate": 6.1796733212341195e-06, + "loss": 33.3256, + "step": 4385 + }, + { + "epoch": 15.83431151241535, + "grad_norm": 258.82598876953125, + "learning_rate": 6.1742286751361166e-06, + "loss": 33.1552, + "step": 4386 + }, + { + "epoch": 15.837923250564334, + "grad_norm": 280.8109130859375, + "learning_rate": 6.168784029038113e-06, + "loss": 32.0024, + "step": 4387 + }, + { + "epoch": 15.841534988713319, + "grad_norm": 265.08111572265625, + "learning_rate": 6.163339382940109e-06, + "loss": 32.4901, + "step": 4388 + }, + { + "epoch": 15.845146726862303, + "grad_norm": 316.56427001953125, + "learning_rate": 6.157894736842105e-06, + "loss": 33.1995, + "step": 4389 + }, + { + "epoch": 15.848758465011286, + "grad_norm": 256.03717041015625, + "learning_rate": 6.1524500907441015e-06, + "loss": 33.1914, + "step": 4390 + }, + { + "epoch": 15.848758465011286, + "eval_loss": 0.6017575263977051, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 4390 + }, + { + "epoch": 15.852370203160271, + "grad_norm": 242.54119873046875, + "learning_rate": 6.1470054446460985e-06, + "loss": 33.8459, + "step": 4391 + }, + { + "epoch": 15.855981941309254, + "grad_norm": 259.1406555175781, + "learning_rate": 6.141560798548094e-06, + "loss": 34.1317, + "step": 4392 + }, + { + "epoch": 15.85959367945824, + "grad_norm": 272.77880859375, + "learning_rate": 6.136116152450091e-06, + "loss": 34.2777, + "step": 4393 + }, + { + "epoch": 15.863205417607224, + "grad_norm": 231.60845947265625, + "learning_rate": 6.130671506352087e-06, + "loss": 34.0165, + "step": 4394 + }, + { + "epoch": 15.866817155756207, + "grad_norm": 230.85675048828125, + "learning_rate": 6.125226860254084e-06, + "loss": 34.2761, + "step": 4395 + }, + { + "epoch": 15.870428893905192, + "grad_norm": 307.4486389160156, + "learning_rate": 6.11978221415608e-06, + "loss": 33.7407, + "step": 4396 + }, + { + "epoch": 15.874040632054175, + "grad_norm": 264.7835388183594, + "learning_rate": 6.114337568058076e-06, + "loss": 34.1672, + "step": 4397 + }, + { + "epoch": 15.87765237020316, + "grad_norm": 234.93968200683594, + "learning_rate": 6.108892921960073e-06, + "loss": 35.7158, + "step": 4398 + }, + { + "epoch": 15.881264108352145, + "grad_norm": 300.0079345703125, + "learning_rate": 6.103448275862069e-06, + "loss": 36.1292, + "step": 4399 + }, + { + "epoch": 15.884875846501128, + "grad_norm": 326.20416259765625, + "learning_rate": 6.0980036297640655e-06, + "loss": 34.8222, + "step": 4400 + }, + { + "epoch": 15.884875846501128, + "eval_loss": 0.6024067401885986, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4400 + }, + { + "epoch": 15.888487584650113, + "grad_norm": 214.6174774169922, + "learning_rate": 6.092558983666062e-06, + "loss": 27.4819, + "step": 4401 + }, + { + "epoch": 15.892099322799098, + "grad_norm": 222.7063446044922, + "learning_rate": 6.087114337568059e-06, + "loss": 22.3862, + "step": 4402 + }, + { + "epoch": 15.89571106094808, + "grad_norm": 277.0006103515625, + "learning_rate": 6.081669691470054e-06, + "loss": 22.8483, + "step": 4403 + }, + { + "epoch": 15.899322799097066, + "grad_norm": 264.3949890136719, + "learning_rate": 6.076225045372051e-06, + "loss": 23.2021, + "step": 4404 + }, + { + "epoch": 15.90293453724605, + "grad_norm": 244.04611206054688, + "learning_rate": 6.0707803992740475e-06, + "loss": 23.9378, + "step": 4405 + }, + { + "epoch": 15.906546275395034, + "grad_norm": 219.24403381347656, + "learning_rate": 6.065335753176044e-06, + "loss": 39.4708, + "step": 4406 + }, + { + "epoch": 15.910158013544018, + "grad_norm": 297.3822937011719, + "learning_rate": 6.05989110707804e-06, + "loss": 39.9151, + "step": 4407 + }, + { + "epoch": 15.913769751693001, + "grad_norm": 282.748291015625, + "learning_rate": 6.054446460980036e-06, + "loss": 39.0545, + "step": 4408 + }, + { + "epoch": 15.917381489841986, + "grad_norm": 274.6419982910156, + "learning_rate": 6.049001814882033e-06, + "loss": 39.7046, + "step": 4409 + }, + { + "epoch": 15.920993227990971, + "grad_norm": 261.2831115722656, + "learning_rate": 6.0435571687840295e-06, + "loss": 39.8849, + "step": 4410 + }, + { + "epoch": 15.920993227990971, + "eval_loss": 0.6017056107521057, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 4410 + }, + { + "epoch": 15.924604966139954, + "grad_norm": 276.61505126953125, + "learning_rate": 6.038112522686026e-06, + "loss": 39.8861, + "step": 4411 + }, + { + "epoch": 15.928216704288939, + "grad_norm": 273.4017333984375, + "learning_rate": 6.032667876588022e-06, + "loss": 36.2526, + "step": 4412 + }, + { + "epoch": 15.931828442437924, + "grad_norm": 314.4811706542969, + "learning_rate": 6.027223230490018e-06, + "loss": 37.1316, + "step": 4413 + }, + { + "epoch": 15.935440180586907, + "grad_norm": 265.7447204589844, + "learning_rate": 6.021778584392014e-06, + "loss": 38.1698, + "step": 4414 + }, + { + "epoch": 15.939051918735892, + "grad_norm": 448.373291015625, + "learning_rate": 6.016333938294011e-06, + "loss": 38.9541, + "step": 4415 + }, + { + "epoch": 15.942663656884875, + "grad_norm": 261.33966064453125, + "learning_rate": 6.010889292196008e-06, + "loss": 36.6694, + "step": 4416 + }, + { + "epoch": 15.94627539503386, + "grad_norm": 383.16363525390625, + "learning_rate": 6.005444646098004e-06, + "loss": 39.1773, + "step": 4417 + }, + { + "epoch": 15.949887133182845, + "grad_norm": 279.26446533203125, + "learning_rate": 6e-06, + "loss": 36.9482, + "step": 4418 + }, + { + "epoch": 15.953498871331828, + "grad_norm": 307.5321960449219, + "learning_rate": 5.994555353901996e-06, + "loss": 36.653, + "step": 4419 + }, + { + "epoch": 15.957110609480813, + "grad_norm": 412.80023193359375, + "learning_rate": 5.989110707803993e-06, + "loss": 36.3768, + "step": 4420 + }, + { + "epoch": 15.957110609480813, + "eval_loss": 0.6033455729484558, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4420 + }, + { + "epoch": 15.960722347629797, + "grad_norm": 254.2952880859375, + "learning_rate": 5.98366606170599e-06, + "loss": 32.546, + "step": 4421 + }, + { + "epoch": 15.96433408577878, + "grad_norm": 324.0749816894531, + "learning_rate": 5.978221415607985e-06, + "loss": 32.7021, + "step": 4422 + }, + { + "epoch": 15.967945823927765, + "grad_norm": 326.0075988769531, + "learning_rate": 5.972776769509982e-06, + "loss": 33.3823, + "step": 4423 + }, + { + "epoch": 15.97155756207675, + "grad_norm": 252.98471069335938, + "learning_rate": 5.967332123411978e-06, + "loss": 33.3397, + "step": 4424 + }, + { + "epoch": 15.975169300225733, + "grad_norm": 243.14117431640625, + "learning_rate": 5.9618874773139755e-06, + "loss": 34.2781, + "step": 4425 + }, + { + "epoch": 15.978781038374718, + "grad_norm": 304.3429260253906, + "learning_rate": 5.956442831215971e-06, + "loss": 34.1163, + "step": 4426 + }, + { + "epoch": 15.982392776523701, + "grad_norm": 320.1651916503906, + "learning_rate": 5.950998185117968e-06, + "loss": 34.1024, + "step": 4427 + }, + { + "epoch": 15.986004514672686, + "grad_norm": 252.0004425048828, + "learning_rate": 5.945553539019964e-06, + "loss": 35.8121, + "step": 4428 + }, + { + "epoch": 15.989616252821671, + "grad_norm": 342.5635986328125, + "learning_rate": 5.9401088929219595e-06, + "loss": 35.6666, + "step": 4429 + }, + { + "epoch": 15.993227990970654, + "grad_norm": 226.57249450683594, + "learning_rate": 5.934664246823957e-06, + "loss": 30.2617, + "step": 4430 + }, + { + "epoch": 15.993227990970654, + "eval_loss": 0.6029886603355408, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4430 + }, + { + "epoch": 15.996839729119639, + "grad_norm": 202.94903564453125, + "learning_rate": 5.929219600725953e-06, + "loss": 22.8166, + "step": 4431 + }, + { + "epoch": 16.0, + "grad_norm": 200.84317016601562, + "learning_rate": 5.92377495462795e-06, + "loss": 20.3903, + "step": 4432 + }, + { + "epoch": 16.003611738148983, + "grad_norm": 230.5917510986328, + "learning_rate": 5.918330308529945e-06, + "loss": 39.0985, + "step": 4433 + }, + { + "epoch": 16.00722347629797, + "grad_norm": 285.6978759765625, + "learning_rate": 5.912885662431942e-06, + "loss": 39.2128, + "step": 4434 + }, + { + "epoch": 16.010835214446953, + "grad_norm": 221.70896911621094, + "learning_rate": 5.907441016333939e-06, + "loss": 38.9026, + "step": 4435 + }, + { + "epoch": 16.014446952595936, + "grad_norm": 318.14068603515625, + "learning_rate": 5.901996370235935e-06, + "loss": 38.7336, + "step": 4436 + }, + { + "epoch": 16.018058690744923, + "grad_norm": 324.451904296875, + "learning_rate": 5.896551724137931e-06, + "loss": 38.7117, + "step": 4437 + }, + { + "epoch": 16.021670428893906, + "grad_norm": 295.038818359375, + "learning_rate": 5.891107078039927e-06, + "loss": 39.6053, + "step": 4438 + }, + { + "epoch": 16.02528216704289, + "grad_norm": 267.0055236816406, + "learning_rate": 5.885662431941924e-06, + "loss": 38.931, + "step": 4439 + }, + { + "epoch": 16.028893905191875, + "grad_norm": 269.20074462890625, + "learning_rate": 5.88021778584392e-06, + "loss": 41.1717, + "step": 4440 + }, + { + "epoch": 16.028893905191875, + "eval_loss": 0.6036069393157959, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.899, + "eval_steps_per_second": 56.899, + "step": 4440 + }, + { + "epoch": 16.03250564334086, + "grad_norm": 241.9443359375, + "learning_rate": 5.874773139745917e-06, + "loss": 38.7027, + "step": 4441 + }, + { + "epoch": 16.03611738148984, + "grad_norm": 238.54847717285156, + "learning_rate": 5.869328493647913e-06, + "loss": 39.1284, + "step": 4442 + }, + { + "epoch": 16.039729119638825, + "grad_norm": 339.3023681640625, + "learning_rate": 5.863883847549909e-06, + "loss": 38.0767, + "step": 4443 + }, + { + "epoch": 16.04334085778781, + "grad_norm": 257.29522705078125, + "learning_rate": 5.8584392014519055e-06, + "loss": 34.8207, + "step": 4444 + }, + { + "epoch": 16.046952595936794, + "grad_norm": 264.24200439453125, + "learning_rate": 5.852994555353902e-06, + "loss": 35.5021, + "step": 4445 + }, + { + "epoch": 16.050564334085777, + "grad_norm": 251.3128662109375, + "learning_rate": 5.847549909255899e-06, + "loss": 35.7826, + "step": 4446 + }, + { + "epoch": 16.054176072234764, + "grad_norm": 310.6581726074219, + "learning_rate": 5.842105263157895e-06, + "loss": 36.7373, + "step": 4447 + }, + { + "epoch": 16.057787810383747, + "grad_norm": 299.07550048828125, + "learning_rate": 5.836660617059891e-06, + "loss": 36.4048, + "step": 4448 + }, + { + "epoch": 16.06139954853273, + "grad_norm": 257.58740234375, + "learning_rate": 5.8312159709618875e-06, + "loss": 36.3982, + "step": 4449 + }, + { + "epoch": 16.065011286681717, + "grad_norm": 337.6795654296875, + "learning_rate": 5.825771324863884e-06, + "loss": 36.8518, + "step": 4450 + }, + { + "epoch": 16.065011286681717, + "eval_loss": 0.6036850214004517, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4450 + }, + { + "epoch": 16.0686230248307, + "grad_norm": 275.02423095703125, + "learning_rate": 5.820326678765881e-06, + "loss": 36.1763, + "step": 4451 + }, + { + "epoch": 16.072234762979683, + "grad_norm": 263.4334716796875, + "learning_rate": 5.814882032667876e-06, + "loss": 37.6417, + "step": 4452 + }, + { + "epoch": 16.07584650112867, + "grad_norm": 213.16749572753906, + "learning_rate": 5.809437386569873e-06, + "loss": 35.6537, + "step": 4453 + }, + { + "epoch": 16.079458239277653, + "grad_norm": 263.4288330078125, + "learning_rate": 5.8039927404718695e-06, + "loss": 36.5693, + "step": 4454 + }, + { + "epoch": 16.083069977426636, + "grad_norm": 284.67254638671875, + "learning_rate": 5.798548094373866e-06, + "loss": 37.3424, + "step": 4455 + }, + { + "epoch": 16.086681715575622, + "grad_norm": 355.7987060546875, + "learning_rate": 5.793103448275862e-06, + "loss": 38.7851, + "step": 4456 + }, + { + "epoch": 16.090293453724605, + "grad_norm": 249.7351531982422, + "learning_rate": 5.787658802177859e-06, + "loss": 38.1334, + "step": 4457 + }, + { + "epoch": 16.09390519187359, + "grad_norm": 257.4977722167969, + "learning_rate": 5.782214156079855e-06, + "loss": 37.8369, + "step": 4458 + }, + { + "epoch": 16.097516930022575, + "grad_norm": 242.59584045410156, + "learning_rate": 5.776769509981851e-06, + "loss": 37.4005, + "step": 4459 + }, + { + "epoch": 16.101128668171558, + "grad_norm": 270.0740966796875, + "learning_rate": 5.771324863883848e-06, + "loss": 38.2287, + "step": 4460 + }, + { + "epoch": 16.101128668171558, + "eval_loss": 0.6018803119659424, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 4460 + }, + { + "epoch": 16.10474040632054, + "grad_norm": 225.32322692871094, + "learning_rate": 5.765880217785844e-06, + "loss": 35.7162, + "step": 4461 + }, + { + "epoch": 16.108352144469524, + "grad_norm": 275.3272705078125, + "learning_rate": 5.760435571687841e-06, + "loss": 32.8733, + "step": 4462 + }, + { + "epoch": 16.11196388261851, + "grad_norm": 259.5124206542969, + "learning_rate": 5.7549909255898364e-06, + "loss": 33.2271, + "step": 4463 + }, + { + "epoch": 16.115575620767494, + "grad_norm": 249.75738525390625, + "learning_rate": 5.7495462794918335e-06, + "loss": 30.2931, + "step": 4464 + }, + { + "epoch": 16.119187358916477, + "grad_norm": 277.7652282714844, + "learning_rate": 5.74410163339383e-06, + "loss": 30.9294, + "step": 4465 + }, + { + "epoch": 16.122799097065464, + "grad_norm": 223.28250122070312, + "learning_rate": 5.738656987295825e-06, + "loss": 31.7337, + "step": 4466 + }, + { + "epoch": 16.126410835214447, + "grad_norm": 259.5106201171875, + "learning_rate": 5.733212341197822e-06, + "loss": 31.2897, + "step": 4467 + }, + { + "epoch": 16.13002257336343, + "grad_norm": 241.0313720703125, + "learning_rate": 5.7277676950998184e-06, + "loss": 32.8436, + "step": 4468 + }, + { + "epoch": 16.133634311512417, + "grad_norm": 277.46905517578125, + "learning_rate": 5.7223230490018155e-06, + "loss": 33.6823, + "step": 4469 + }, + { + "epoch": 16.1372460496614, + "grad_norm": 264.2905578613281, + "learning_rate": 5.716878402903811e-06, + "loss": 33.1107, + "step": 4470 + }, + { + "epoch": 16.1372460496614, + "eval_loss": 0.6046355962753296, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 4470 + }, + { + "epoch": 16.140857787810383, + "grad_norm": 295.5188903808594, + "learning_rate": 5.711433756805808e-06, + "loss": 33.6291, + "step": 4471 + }, + { + "epoch": 16.14446952595937, + "grad_norm": 282.6014709472656, + "learning_rate": 5.705989110707804e-06, + "loss": 33.0773, + "step": 4472 + }, + { + "epoch": 16.148081264108352, + "grad_norm": 270.7958679199219, + "learning_rate": 5.7005444646098004e-06, + "loss": 35.0269, + "step": 4473 + }, + { + "epoch": 16.151693002257336, + "grad_norm": 344.7304992675781, + "learning_rate": 5.695099818511797e-06, + "loss": 35.1349, + "step": 4474 + }, + { + "epoch": 16.155304740406322, + "grad_norm": 294.5618896484375, + "learning_rate": 5.689655172413793e-06, + "loss": 36.3309, + "step": 4475 + }, + { + "epoch": 16.158916478555305, + "grad_norm": 305.5354309082031, + "learning_rate": 5.68421052631579e-06, + "loss": 35.0976, + "step": 4476 + }, + { + "epoch": 16.16252821670429, + "grad_norm": 293.9934387207031, + "learning_rate": 5.678765880217786e-06, + "loss": 34.9113, + "step": 4477 + }, + { + "epoch": 16.16613995485327, + "grad_norm": 277.9523010253906, + "learning_rate": 5.6733212341197824e-06, + "loss": 24.8815, + "step": 4478 + }, + { + "epoch": 16.169751693002258, + "grad_norm": 297.0547790527344, + "learning_rate": 5.667876588021779e-06, + "loss": 22.4544, + "step": 4479 + }, + { + "epoch": 16.17336343115124, + "grad_norm": 237.44741821289062, + "learning_rate": 5.662431941923776e-06, + "loss": 21.8323, + "step": 4480 + }, + { + "epoch": 16.17336343115124, + "eval_loss": 0.6061411499977112, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4480 + }, + { + "epoch": 16.176975169300224, + "grad_norm": 220.5832977294922, + "learning_rate": 5.656987295825771e-06, + "loss": 22.7531, + "step": 4481 + }, + { + "epoch": 16.18058690744921, + "grad_norm": 298.8033142089844, + "learning_rate": 5.651542649727767e-06, + "loss": 23.7107, + "step": 4482 + }, + { + "epoch": 16.184198645598194, + "grad_norm": 250.02593994140625, + "learning_rate": 5.6460980036297644e-06, + "loss": 39.1679, + "step": 4483 + }, + { + "epoch": 16.187810383747177, + "grad_norm": 253.00746154785156, + "learning_rate": 5.640653357531761e-06, + "loss": 40.6492, + "step": 4484 + }, + { + "epoch": 16.191422121896164, + "grad_norm": 215.04270935058594, + "learning_rate": 5.635208711433757e-06, + "loss": 38.604, + "step": 4485 + }, + { + "epoch": 16.195033860045147, + "grad_norm": 395.6152648925781, + "learning_rate": 5.629764065335753e-06, + "loss": 39.1417, + "step": 4486 + }, + { + "epoch": 16.19864559819413, + "grad_norm": 380.3653869628906, + "learning_rate": 5.62431941923775e-06, + "loss": 39.4322, + "step": 4487 + }, + { + "epoch": 16.202257336343116, + "grad_norm": 309.3524475097656, + "learning_rate": 5.6188747731397464e-06, + "loss": 39.1721, + "step": 4488 + }, + { + "epoch": 16.2058690744921, + "grad_norm": 237.88262939453125, + "learning_rate": 5.613430127041742e-06, + "loss": 39.1462, + "step": 4489 + }, + { + "epoch": 16.209480812641083, + "grad_norm": 233.66690063476562, + "learning_rate": 5.607985480943739e-06, + "loss": 39.8177, + "step": 4490 + }, + { + "epoch": 16.209480812641083, + "eval_loss": 0.6043822169303894, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4490 + }, + { + "epoch": 16.21309255079007, + "grad_norm": 229.3720703125, + "learning_rate": 5.602540834845735e-06, + "loss": 39.7878, + "step": 4491 + }, + { + "epoch": 16.216704288939052, + "grad_norm": 228.66493225097656, + "learning_rate": 5.597096188747731e-06, + "loss": 40.0754, + "step": 4492 + }, + { + "epoch": 16.220316027088035, + "grad_norm": 276.40240478515625, + "learning_rate": 5.591651542649728e-06, + "loss": 38.7709, + "step": 4493 + }, + { + "epoch": 16.223927765237022, + "grad_norm": 268.62371826171875, + "learning_rate": 5.586206896551725e-06, + "loss": 37.7439, + "step": 4494 + }, + { + "epoch": 16.227539503386005, + "grad_norm": 271.0934753417969, + "learning_rate": 5.580762250453721e-06, + "loss": 38.2511, + "step": 4495 + }, + { + "epoch": 16.231151241534988, + "grad_norm": 253.63385009765625, + "learning_rate": 5.575317604355716e-06, + "loss": 36.716, + "step": 4496 + }, + { + "epoch": 16.23476297968397, + "grad_norm": 265.1177978515625, + "learning_rate": 5.569872958257713e-06, + "loss": 36.5517, + "step": 4497 + }, + { + "epoch": 16.238374717832958, + "grad_norm": 332.52972412109375, + "learning_rate": 5.56442831215971e-06, + "loss": 37.1524, + "step": 4498 + }, + { + "epoch": 16.24198645598194, + "grad_norm": 247.53643798828125, + "learning_rate": 5.558983666061707e-06, + "loss": 36.6666, + "step": 4499 + }, + { + "epoch": 16.245598194130924, + "grad_norm": 233.3318634033203, + "learning_rate": 5.553539019963702e-06, + "loss": 37.0842, + "step": 4500 + }, + { + "epoch": 16.245598194130924, + "eval_loss": 0.6042913794517517, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4500 + }, + { + "epoch": 16.24920993227991, + "grad_norm": 222.98350524902344, + "learning_rate": 5.548094373865699e-06, + "loss": 37.6382, + "step": 4501 + }, + { + "epoch": 16.252821670428894, + "grad_norm": 234.33267211914062, + "learning_rate": 5.542649727767695e-06, + "loss": 38.0509, + "step": 4502 + }, + { + "epoch": 16.256433408577877, + "grad_norm": 303.56005859375, + "learning_rate": 5.5372050816696924e-06, + "loss": 36.509, + "step": 4503 + }, + { + "epoch": 16.260045146726863, + "grad_norm": 232.0821075439453, + "learning_rate": 5.531760435571688e-06, + "loss": 36.3975, + "step": 4504 + }, + { + "epoch": 16.263656884875846, + "grad_norm": 223.3292236328125, + "learning_rate": 5.526315789473684e-06, + "loss": 37.0448, + "step": 4505 + }, + { + "epoch": 16.26726862302483, + "grad_norm": 241.2131805419922, + "learning_rate": 5.520871143375681e-06, + "loss": 37.8635, + "step": 4506 + }, + { + "epoch": 16.270880361173816, + "grad_norm": 288.62689208984375, + "learning_rate": 5.5154264972776765e-06, + "loss": 38.2789, + "step": 4507 + }, + { + "epoch": 16.2744920993228, + "grad_norm": 262.59637451171875, + "learning_rate": 5.5099818511796736e-06, + "loss": 37.9052, + "step": 4508 + }, + { + "epoch": 16.278103837471782, + "grad_norm": 258.0476379394531, + "learning_rate": 5.50453720508167e-06, + "loss": 38.0485, + "step": 4509 + }, + { + "epoch": 16.28171557562077, + "grad_norm": 295.2730407714844, + "learning_rate": 5.499092558983667e-06, + "loss": 37.6134, + "step": 4510 + }, + { + "epoch": 16.28171557562077, + "eval_loss": 0.601740300655365, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4510 + }, + { + "epoch": 16.285327313769752, + "grad_norm": 246.38548278808594, + "learning_rate": 5.493647912885662e-06, + "loss": 36.1289, + "step": 4511 + }, + { + "epoch": 16.288939051918735, + "grad_norm": 271.28997802734375, + "learning_rate": 5.4882032667876585e-06, + "loss": 31.8834, + "step": 4512 + }, + { + "epoch": 16.292550790067722, + "grad_norm": 231.76246643066406, + "learning_rate": 5.4827586206896556e-06, + "loss": 31.4899, + "step": 4513 + }, + { + "epoch": 16.296162528216705, + "grad_norm": 238.7414093017578, + "learning_rate": 5.477313974591652e-06, + "loss": 31.7102, + "step": 4514 + }, + { + "epoch": 16.299774266365688, + "grad_norm": 302.0710144042969, + "learning_rate": 5.471869328493648e-06, + "loss": 31.3557, + "step": 4515 + }, + { + "epoch": 16.30338600451467, + "grad_norm": 282.72015380859375, + "learning_rate": 5.466424682395644e-06, + "loss": 33.0781, + "step": 4516 + }, + { + "epoch": 16.306997742663658, + "grad_norm": 224.8140869140625, + "learning_rate": 5.460980036297641e-06, + "loss": 33.2963, + "step": 4517 + }, + { + "epoch": 16.31060948081264, + "grad_norm": 239.20570373535156, + "learning_rate": 5.4555353901996376e-06, + "loss": 34.4455, + "step": 4518 + }, + { + "epoch": 16.314221218961624, + "grad_norm": 304.7758483886719, + "learning_rate": 5.450090744101633e-06, + "loss": 34.534, + "step": 4519 + }, + { + "epoch": 16.31783295711061, + "grad_norm": 274.8758239746094, + "learning_rate": 5.44464609800363e-06, + "loss": 33.5232, + "step": 4520 + }, + { + "epoch": 16.31783295711061, + "eval_loss": 0.6031973958015442, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4520 + }, + { + "epoch": 16.321444695259594, + "grad_norm": 295.1776428222656, + "learning_rate": 5.439201451905626e-06, + "loss": 33.403, + "step": 4521 + }, + { + "epoch": 16.325056433408577, + "grad_norm": 309.03399658203125, + "learning_rate": 5.4337568058076225e-06, + "loss": 34.1785, + "step": 4522 + }, + { + "epoch": 16.328668171557563, + "grad_norm": 285.26385498046875, + "learning_rate": 5.428312159709619e-06, + "loss": 34.4855, + "step": 4523 + }, + { + "epoch": 16.332279909706546, + "grad_norm": 307.0184020996094, + "learning_rate": 5.422867513611616e-06, + "loss": 32.4791, + "step": 4524 + }, + { + "epoch": 16.33589164785553, + "grad_norm": 318.8267822265625, + "learning_rate": 5.417422867513612e-06, + "loss": 35.697, + "step": 4525 + }, + { + "epoch": 16.339503386004516, + "grad_norm": 356.0179138183594, + "learning_rate": 5.411978221415607e-06, + "loss": 36.1811, + "step": 4526 + }, + { + "epoch": 16.3431151241535, + "grad_norm": 332.1255187988281, + "learning_rate": 5.4065335753176045e-06, + "loss": 36.2251, + "step": 4527 + }, + { + "epoch": 16.346726862302482, + "grad_norm": 288.78118896484375, + "learning_rate": 5.401088929219601e-06, + "loss": 32.0518, + "step": 4528 + }, + { + "epoch": 16.35033860045147, + "grad_norm": 250.37245178222656, + "learning_rate": 5.395644283121598e-06, + "loss": 23.627, + "step": 4529 + }, + { + "epoch": 16.353950338600452, + "grad_norm": 199.92352294921875, + "learning_rate": 5.390199637023593e-06, + "loss": 21.7919, + "step": 4530 + }, + { + "epoch": 16.353950338600452, + "eval_loss": 0.6021688580513, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 4530 + }, + { + "epoch": 16.357562076749435, + "grad_norm": 265.47015380859375, + "learning_rate": 5.38475499092559e-06, + "loss": 23.0672, + "step": 4531 + }, + { + "epoch": 16.36117381489842, + "grad_norm": 281.188720703125, + "learning_rate": 5.3793103448275865e-06, + "loss": 22.7983, + "step": 4532 + }, + { + "epoch": 16.364785553047405, + "grad_norm": 195.5351104736328, + "learning_rate": 5.373865698729583e-06, + "loss": 38.1042, + "step": 4533 + }, + { + "epoch": 16.368397291196388, + "grad_norm": 234.76573181152344, + "learning_rate": 5.368421052631579e-06, + "loss": 39.8602, + "step": 4534 + }, + { + "epoch": 16.37200902934537, + "grad_norm": 237.9152374267578, + "learning_rate": 5.362976406533575e-06, + "loss": 40.2156, + "step": 4535 + }, + { + "epoch": 16.375620767494357, + "grad_norm": 297.722900390625, + "learning_rate": 5.357531760435572e-06, + "loss": 39.3676, + "step": 4536 + }, + { + "epoch": 16.37923250564334, + "grad_norm": 218.61727905273438, + "learning_rate": 5.352087114337568e-06, + "loss": 38.7905, + "step": 4537 + }, + { + "epoch": 16.382844243792324, + "grad_norm": 245.19561767578125, + "learning_rate": 5.346642468239565e-06, + "loss": 39.3998, + "step": 4538 + }, + { + "epoch": 16.38645598194131, + "grad_norm": 247.5048370361328, + "learning_rate": 5.341197822141561e-06, + "loss": 40.0835, + "step": 4539 + }, + { + "epoch": 16.390067720090293, + "grad_norm": 214.40684509277344, + "learning_rate": 5.335753176043558e-06, + "loss": 39.1135, + "step": 4540 + }, + { + "epoch": 16.390067720090293, + "eval_loss": 0.6014460325241089, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4540 + }, + { + "epoch": 16.393679458239276, + "grad_norm": 216.72271728515625, + "learning_rate": 5.330308529945553e-06, + "loss": 38.9449, + "step": 4541 + }, + { + "epoch": 16.397291196388263, + "grad_norm": 224.22262573242188, + "learning_rate": 5.32486388384755e-06, + "loss": 39.2646, + "step": 4542 + }, + { + "epoch": 16.400902934537246, + "grad_norm": 258.6524353027344, + "learning_rate": 5.319419237749547e-06, + "loss": 38.0846, + "step": 4543 + }, + { + "epoch": 16.40451467268623, + "grad_norm": 241.7313232421875, + "learning_rate": 5.313974591651543e-06, + "loss": 37.4963, + "step": 4544 + }, + { + "epoch": 16.408126410835216, + "grad_norm": 241.3990478515625, + "learning_rate": 5.308529945553539e-06, + "loss": 36.4783, + "step": 4545 + }, + { + "epoch": 16.4117381489842, + "grad_norm": 207.1470947265625, + "learning_rate": 5.303085299455535e-06, + "loss": 36.1592, + "step": 4546 + }, + { + "epoch": 16.415349887133182, + "grad_norm": 224.51690673828125, + "learning_rate": 5.2976406533575325e-06, + "loss": 35.7946, + "step": 4547 + }, + { + "epoch": 16.41896162528217, + "grad_norm": 292.4340515136719, + "learning_rate": 5.292196007259528e-06, + "loss": 36.8986, + "step": 4548 + }, + { + "epoch": 16.42257336343115, + "grad_norm": 244.67117309570312, + "learning_rate": 5.286751361161524e-06, + "loss": 37.1165, + "step": 4549 + }, + { + "epoch": 16.426185101580135, + "grad_norm": 331.14654541015625, + "learning_rate": 5.281306715063521e-06, + "loss": 36.4423, + "step": 4550 + }, + { + "epoch": 16.426185101580135, + "eval_loss": 0.6067427396774292, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4550 + }, + { + "epoch": 16.42979683972912, + "grad_norm": 262.373046875, + "learning_rate": 5.275862068965517e-06, + "loss": 39.0014, + "step": 4551 + }, + { + "epoch": 16.433408577878104, + "grad_norm": 237.48350524902344, + "learning_rate": 5.270417422867514e-06, + "loss": 38.0152, + "step": 4552 + }, + { + "epoch": 16.437020316027088, + "grad_norm": 273.0652770996094, + "learning_rate": 5.26497277676951e-06, + "loss": 37.6952, + "step": 4553 + }, + { + "epoch": 16.44063205417607, + "grad_norm": 239.0780029296875, + "learning_rate": 5.259528130671507e-06, + "loss": 38.4266, + "step": 4554 + }, + { + "epoch": 16.444243792325057, + "grad_norm": 277.978759765625, + "learning_rate": 5.254083484573503e-06, + "loss": 36.5596, + "step": 4555 + }, + { + "epoch": 16.44785553047404, + "grad_norm": 216.2267303466797, + "learning_rate": 5.248638838475499e-06, + "loss": 39.1408, + "step": 4556 + }, + { + "epoch": 16.451467268623023, + "grad_norm": 231.80581665039062, + "learning_rate": 5.243194192377496e-06, + "loss": 38.7286, + "step": 4557 + }, + { + "epoch": 16.45507900677201, + "grad_norm": 236.4004669189453, + "learning_rate": 5.237749546279492e-06, + "loss": 39.2426, + "step": 4558 + }, + { + "epoch": 16.458690744920993, + "grad_norm": 270.0268859863281, + "learning_rate": 5.232304900181488e-06, + "loss": 38.6546, + "step": 4559 + }, + { + "epoch": 16.462302483069976, + "grad_norm": 255.8044891357422, + "learning_rate": 5.226860254083484e-06, + "loss": 37.554, + "step": 4560 + }, + { + "epoch": 16.462302483069976, + "eval_loss": 0.6019929647445679, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4560 + }, + { + "epoch": 16.465914221218963, + "grad_norm": 321.18499755859375, + "learning_rate": 5.221415607985481e-06, + "loss": 34.9309, + "step": 4561 + }, + { + "epoch": 16.469525959367946, + "grad_norm": 311.94305419921875, + "learning_rate": 5.215970961887478e-06, + "loss": 35.8779, + "step": 4562 + }, + { + "epoch": 16.47313769751693, + "grad_norm": 211.90234375, + "learning_rate": 5.210526315789474e-06, + "loss": 31.8385, + "step": 4563 + }, + { + "epoch": 16.476749435665916, + "grad_norm": 284.64581298828125, + "learning_rate": 5.20508166969147e-06, + "loss": 31.8078, + "step": 4564 + }, + { + "epoch": 16.4803611738149, + "grad_norm": 291.94891357421875, + "learning_rate": 5.199637023593466e-06, + "loss": 33.2542, + "step": 4565 + }, + { + "epoch": 16.483972911963882, + "grad_norm": 243.61956787109375, + "learning_rate": 5.194192377495463e-06, + "loss": 31.5292, + "step": 4566 + }, + { + "epoch": 16.48758465011287, + "grad_norm": 242.07696533203125, + "learning_rate": 5.188747731397459e-06, + "loss": 33.9643, + "step": 4567 + }, + { + "epoch": 16.49119638826185, + "grad_norm": 255.0625457763672, + "learning_rate": 5.183303085299456e-06, + "loss": 33.7718, + "step": 4568 + }, + { + "epoch": 16.494808126410835, + "grad_norm": 249.40240478515625, + "learning_rate": 5.177858439201452e-06, + "loss": 31.5248, + "step": 4569 + }, + { + "epoch": 16.498419864559818, + "grad_norm": 231.3375244140625, + "learning_rate": 5.172413793103449e-06, + "loss": 34.5657, + "step": 4570 + }, + { + "epoch": 16.498419864559818, + "eval_loss": 0.6017265319824219, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.01, + "eval_steps_per_second": 57.01, + "step": 4570 + }, + { + "epoch": 16.502031602708804, + "grad_norm": 247.97012329101562, + "learning_rate": 5.1669691470054445e-06, + "loss": 33.766, + "step": 4571 + }, + { + "epoch": 16.505643340857787, + "grad_norm": 310.730224609375, + "learning_rate": 5.161524500907441e-06, + "loss": 34.0841, + "step": 4572 + }, + { + "epoch": 16.50925507900677, + "grad_norm": 323.5569152832031, + "learning_rate": 5.156079854809438e-06, + "loss": 35.0788, + "step": 4573 + }, + { + "epoch": 16.512866817155757, + "grad_norm": 247.95480346679688, + "learning_rate": 5.150635208711433e-06, + "loss": 33.5322, + "step": 4574 + }, + { + "epoch": 16.51647855530474, + "grad_norm": 307.6163024902344, + "learning_rate": 5.14519056261343e-06, + "loss": 34.4701, + "step": 4575 + }, + { + "epoch": 16.520090293453723, + "grad_norm": 239.569580078125, + "learning_rate": 5.1397459165154265e-06, + "loss": 35.8526, + "step": 4576 + }, + { + "epoch": 16.52370203160271, + "grad_norm": 362.4159240722656, + "learning_rate": 5.134301270417424e-06, + "loss": 36.2235, + "step": 4577 + }, + { + "epoch": 16.527313769751693, + "grad_norm": 321.2509765625, + "learning_rate": 5.128856624319419e-06, + "loss": 33.4705, + "step": 4578 + }, + { + "epoch": 16.530925507900676, + "grad_norm": 248.6092071533203, + "learning_rate": 5.123411978221415e-06, + "loss": 23.1329, + "step": 4579 + }, + { + "epoch": 16.534537246049663, + "grad_norm": 289.8996276855469, + "learning_rate": 5.117967332123412e-06, + "loss": 20.3184, + "step": 4580 + }, + { + "epoch": 16.534537246049663, + "eval_loss": 0.6034744381904602, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 4580 + }, + { + "epoch": 16.538148984198646, + "grad_norm": 215.02142333984375, + "learning_rate": 5.1125226860254085e-06, + "loss": 23.0513, + "step": 4581 + }, + { + "epoch": 16.54176072234763, + "grad_norm": 299.8429870605469, + "learning_rate": 5.107078039927405e-06, + "loss": 24.462, + "step": 4582 + }, + { + "epoch": 16.545372460496615, + "grad_norm": 267.0840759277344, + "learning_rate": 5.101633393829401e-06, + "loss": 39.9148, + "step": 4583 + }, + { + "epoch": 16.5489841986456, + "grad_norm": 227.23731994628906, + "learning_rate": 5.096188747731398e-06, + "loss": 40.6498, + "step": 4584 + }, + { + "epoch": 16.55259593679458, + "grad_norm": 313.9705810546875, + "learning_rate": 5.0907441016333935e-06, + "loss": 38.7711, + "step": 4585 + }, + { + "epoch": 16.55620767494357, + "grad_norm": 398.0429382324219, + "learning_rate": 5.0852994555353905e-06, + "loss": 39.6938, + "step": 4586 + }, + { + "epoch": 16.55981941309255, + "grad_norm": 365.489990234375, + "learning_rate": 5.079854809437387e-06, + "loss": 39.356, + "step": 4587 + }, + { + "epoch": 16.563431151241534, + "grad_norm": 365.05267333984375, + "learning_rate": 5.074410163339383e-06, + "loss": 40.2504, + "step": 4588 + }, + { + "epoch": 16.567042889390518, + "grad_norm": 288.0643310546875, + "learning_rate": 5.068965517241379e-06, + "loss": 39.6045, + "step": 4589 + }, + { + "epoch": 16.570654627539504, + "grad_norm": 262.0147705078125, + "learning_rate": 5.0635208711433755e-06, + "loss": 40.2504, + "step": 4590 + }, + { + "epoch": 16.570654627539504, + "eval_loss": 0.6028281450271606, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 4590 + }, + { + "epoch": 16.574266365688487, + "grad_norm": 325.78387451171875, + "learning_rate": 5.0580762250453725e-06, + "loss": 40.3154, + "step": 4591 + }, + { + "epoch": 16.57787810383747, + "grad_norm": 221.56591796875, + "learning_rate": 5.052631578947369e-06, + "loss": 39.5046, + "step": 4592 + }, + { + "epoch": 16.581489841986457, + "grad_norm": 227.02520751953125, + "learning_rate": 5.047186932849365e-06, + "loss": 38.3611, + "step": 4593 + }, + { + "epoch": 16.58510158013544, + "grad_norm": 232.46922302246094, + "learning_rate": 5.041742286751361e-06, + "loss": 36.5043, + "step": 4594 + }, + { + "epoch": 16.588713318284423, + "grad_norm": 230.59536743164062, + "learning_rate": 5.0362976406533575e-06, + "loss": 36.2179, + "step": 4595 + }, + { + "epoch": 16.59232505643341, + "grad_norm": 439.9609069824219, + "learning_rate": 5.0308529945553545e-06, + "loss": 36.4797, + "step": 4596 + }, + { + "epoch": 16.595936794582393, + "grad_norm": 322.4086608886719, + "learning_rate": 5.02540834845735e-06, + "loss": 37.4151, + "step": 4597 + }, + { + "epoch": 16.599548532731376, + "grad_norm": 318.1732482910156, + "learning_rate": 5.019963702359347e-06, + "loss": 37.2815, + "step": 4598 + }, + { + "epoch": 16.603160270880363, + "grad_norm": 321.34039306640625, + "learning_rate": 5.014519056261343e-06, + "loss": 36.8388, + "step": 4599 + }, + { + "epoch": 16.606772009029346, + "grad_norm": 341.28790283203125, + "learning_rate": 5.0090744101633395e-06, + "loss": 37.9805, + "step": 4600 + }, + { + "epoch": 16.606772009029346, + "eval_loss": 0.6045316457748413, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 4600 + }, + { + "epoch": 16.61038374717833, + "grad_norm": 259.9163513183594, + "learning_rate": 5.003629764065336e-06, + "loss": 37.5832, + "step": 4601 + }, + { + "epoch": 16.613995485327315, + "grad_norm": 297.02587890625, + "learning_rate": 4.998185117967332e-06, + "loss": 37.3808, + "step": 4602 + }, + { + "epoch": 16.6176072234763, + "grad_norm": 263.32244873046875, + "learning_rate": 4.992740471869329e-06, + "loss": 37.1047, + "step": 4603 + }, + { + "epoch": 16.62121896162528, + "grad_norm": 262.26104736328125, + "learning_rate": 4.987295825771324e-06, + "loss": 38.3592, + "step": 4604 + }, + { + "epoch": 16.624830699774268, + "grad_norm": 253.7144012451172, + "learning_rate": 4.9818511796733215e-06, + "loss": 37.4098, + "step": 4605 + }, + { + "epoch": 16.62844243792325, + "grad_norm": 279.1004943847656, + "learning_rate": 4.976406533575318e-06, + "loss": 39.3865, + "step": 4606 + }, + { + "epoch": 16.632054176072234, + "grad_norm": 298.7977600097656, + "learning_rate": 4.970961887477315e-06, + "loss": 38.6865, + "step": 4607 + }, + { + "epoch": 16.635665914221217, + "grad_norm": 256.7657470703125, + "learning_rate": 4.96551724137931e-06, + "loss": 38.7068, + "step": 4608 + }, + { + "epoch": 16.639277652370204, + "grad_norm": 238.22979736328125, + "learning_rate": 4.960072595281307e-06, + "loss": 37.749, + "step": 4609 + }, + { + "epoch": 16.642889390519187, + "grad_norm": 248.4231414794922, + "learning_rate": 4.9546279491833035e-06, + "loss": 37.582, + "step": 4610 + }, + { + "epoch": 16.642889390519187, + "eval_loss": 0.6026645302772522, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4610 + }, + { + "epoch": 16.64650112866817, + "grad_norm": 232.70289611816406, + "learning_rate": 4.949183303085299e-06, + "loss": 34.4589, + "step": 4611 + }, + { + "epoch": 16.650112866817157, + "grad_norm": 268.4678955078125, + "learning_rate": 4.943738656987296e-06, + "loss": 32.3619, + "step": 4612 + }, + { + "epoch": 16.65372460496614, + "grad_norm": 272.07794189453125, + "learning_rate": 4.938294010889292e-06, + "loss": 32.3436, + "step": 4613 + }, + { + "epoch": 16.657336343115123, + "grad_norm": 304.4588317871094, + "learning_rate": 4.932849364791289e-06, + "loss": 30.8798, + "step": 4614 + }, + { + "epoch": 16.66094808126411, + "grad_norm": 293.3638000488281, + "learning_rate": 4.927404718693285e-06, + "loss": 31.1892, + "step": 4615 + }, + { + "epoch": 16.664559819413093, + "grad_norm": 292.844482421875, + "learning_rate": 4.921960072595282e-06, + "loss": 31.9604, + "step": 4616 + }, + { + "epoch": 16.668171557562076, + "grad_norm": 246.45339965820312, + "learning_rate": 4.916515426497278e-06, + "loss": 32.242, + "step": 4617 + }, + { + "epoch": 16.671783295711062, + "grad_norm": 269.9577941894531, + "learning_rate": 4.911070780399274e-06, + "loss": 32.5072, + "step": 4618 + }, + { + "epoch": 16.675395033860045, + "grad_norm": 312.8960876464844, + "learning_rate": 4.90562613430127e-06, + "loss": 33.8243, + "step": 4619 + }, + { + "epoch": 16.67900677200903, + "grad_norm": 287.4557189941406, + "learning_rate": 4.900181488203267e-06, + "loss": 34.3557, + "step": 4620 + }, + { + "epoch": 16.67900677200903, + "eval_loss": 0.6047338843345642, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4620 + }, + { + "epoch": 16.682618510158015, + "grad_norm": 403.533935546875, + "learning_rate": 4.894736842105264e-06, + "loss": 34.6895, + "step": 4621 + }, + { + "epoch": 16.686230248306998, + "grad_norm": 387.5083923339844, + "learning_rate": 4.88929219600726e-06, + "loss": 34.2407, + "step": 4622 + }, + { + "epoch": 16.68984198645598, + "grad_norm": 278.8225402832031, + "learning_rate": 4.883847549909256e-06, + "loss": 33.3489, + "step": 4623 + }, + { + "epoch": 16.693453724604964, + "grad_norm": 270.46685791015625, + "learning_rate": 4.878402903811252e-06, + "loss": 34.2095, + "step": 4624 + }, + { + "epoch": 16.69706546275395, + "grad_norm": 244.6392059326172, + "learning_rate": 4.872958257713249e-06, + "loss": 35.783, + "step": 4625 + }, + { + "epoch": 16.700677200902934, + "grad_norm": 327.0617370605469, + "learning_rate": 4.867513611615245e-06, + "loss": 36.4928, + "step": 4626 + }, + { + "epoch": 16.704288939051917, + "grad_norm": 297.0531311035156, + "learning_rate": 4.862068965517241e-06, + "loss": 33.4827, + "step": 4627 + }, + { + "epoch": 16.707900677200904, + "grad_norm": 366.2174377441406, + "learning_rate": 4.856624319419238e-06, + "loss": 26.9456, + "step": 4628 + }, + { + "epoch": 16.711512415349887, + "grad_norm": 436.22613525390625, + "learning_rate": 4.851179673321234e-06, + "loss": 22.2349, + "step": 4629 + }, + { + "epoch": 16.71512415349887, + "grad_norm": 391.7647705078125, + "learning_rate": 4.845735027223231e-06, + "loss": 22.8557, + "step": 4630 + }, + { + "epoch": 16.71512415349887, + "eval_loss": 0.6052708029747009, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 4630 + }, + { + "epoch": 16.718735891647857, + "grad_norm": 277.8678283691406, + "learning_rate": 4.840290381125227e-06, + "loss": 23.3521, + "step": 4631 + }, + { + "epoch": 16.72234762979684, + "grad_norm": 252.46131896972656, + "learning_rate": 4.834845735027224e-06, + "loss": 23.7394, + "step": 4632 + }, + { + "epoch": 16.725959367945823, + "grad_norm": 214.6287078857422, + "learning_rate": 4.82940108892922e-06, + "loss": 38.6633, + "step": 4633 + }, + { + "epoch": 16.72957110609481, + "grad_norm": 257.454345703125, + "learning_rate": 4.8239564428312155e-06, + "loss": 40.5165, + "step": 4634 + }, + { + "epoch": 16.733182844243792, + "grad_norm": 211.1912841796875, + "learning_rate": 4.818511796733213e-06, + "loss": 38.483, + "step": 4635 + }, + { + "epoch": 16.736794582392776, + "grad_norm": 226.8388214111328, + "learning_rate": 4.813067150635209e-06, + "loss": 39.6143, + "step": 4636 + }, + { + "epoch": 16.740406320541762, + "grad_norm": 263.8160400390625, + "learning_rate": 4.807622504537205e-06, + "loss": 37.8442, + "step": 4637 + }, + { + "epoch": 16.744018058690745, + "grad_norm": 284.8119201660156, + "learning_rate": 4.802177858439201e-06, + "loss": 39.1835, + "step": 4638 + }, + { + "epoch": 16.74762979683973, + "grad_norm": 310.31390380859375, + "learning_rate": 4.796733212341198e-06, + "loss": 38.7035, + "step": 4639 + }, + { + "epoch": 16.751241534988715, + "grad_norm": 212.71315002441406, + "learning_rate": 4.791288566243195e-06, + "loss": 38.8803, + "step": 4640 + }, + { + "epoch": 16.751241534988715, + "eval_loss": 0.6030828952789307, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4640 + }, + { + "epoch": 16.754853273137698, + "grad_norm": 209.7708740234375, + "learning_rate": 4.78584392014519e-06, + "loss": 39.0808, + "step": 4641 + }, + { + "epoch": 16.75846501128668, + "grad_norm": 251.971435546875, + "learning_rate": 4.780399274047187e-06, + "loss": 39.2025, + "step": 4642 + }, + { + "epoch": 16.762076749435664, + "grad_norm": 210.54151916503906, + "learning_rate": 4.774954627949183e-06, + "loss": 37.7541, + "step": 4643 + }, + { + "epoch": 16.76568848758465, + "grad_norm": 221.22119140625, + "learning_rate": 4.76950998185118e-06, + "loss": 36.4328, + "step": 4644 + }, + { + "epoch": 16.769300225733634, + "grad_norm": 201.45025634765625, + "learning_rate": 4.764065335753176e-06, + "loss": 34.9771, + "step": 4645 + }, + { + "epoch": 16.772911963882617, + "grad_norm": 241.33030700683594, + "learning_rate": 4.758620689655173e-06, + "loss": 37.6231, + "step": 4646 + }, + { + "epoch": 16.776523702031604, + "grad_norm": 282.12255859375, + "learning_rate": 4.753176043557169e-06, + "loss": 36.9822, + "step": 4647 + }, + { + "epoch": 16.780135440180587, + "grad_norm": 239.93885803222656, + "learning_rate": 4.747731397459165e-06, + "loss": 36.3529, + "step": 4648 + }, + { + "epoch": 16.78374717832957, + "grad_norm": 245.9400634765625, + "learning_rate": 4.7422867513611615e-06, + "loss": 37.518, + "step": 4649 + }, + { + "epoch": 16.787358916478556, + "grad_norm": 280.63720703125, + "learning_rate": 4.736842105263158e-06, + "loss": 37.6323, + "step": 4650 + }, + { + "epoch": 16.787358916478556, + "eval_loss": 0.6054876446723938, + "eval_runtime": 3.1439, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4650 + }, + { + "epoch": 16.79097065462754, + "grad_norm": 368.47698974609375, + "learning_rate": 4.731397459165155e-06, + "loss": 38.1543, + "step": 4651 + }, + { + "epoch": 16.794582392776523, + "grad_norm": 346.9169616699219, + "learning_rate": 4.72595281306715e-06, + "loss": 38.8746, + "step": 4652 + }, + { + "epoch": 16.79819413092551, + "grad_norm": 311.7519836425781, + "learning_rate": 4.720508166969147e-06, + "loss": 37.3475, + "step": 4653 + }, + { + "epoch": 16.801805869074492, + "grad_norm": 323.14910888671875, + "learning_rate": 4.7150635208711435e-06, + "loss": 38.5308, + "step": 4654 + }, + { + "epoch": 16.805417607223475, + "grad_norm": 252.71958923339844, + "learning_rate": 4.70961887477314e-06, + "loss": 38.3275, + "step": 4655 + }, + { + "epoch": 16.809029345372462, + "grad_norm": 364.2929382324219, + "learning_rate": 4.704174228675136e-06, + "loss": 38.9973, + "step": 4656 + }, + { + "epoch": 16.812641083521445, + "grad_norm": 267.23980712890625, + "learning_rate": 4.698729582577132e-06, + "loss": 38.0867, + "step": 4657 + }, + { + "epoch": 16.816252821670428, + "grad_norm": 297.4647521972656, + "learning_rate": 4.693284936479129e-06, + "loss": 38.6933, + "step": 4658 + }, + { + "epoch": 16.819864559819415, + "grad_norm": 276.2767333984375, + "learning_rate": 4.6878402903811255e-06, + "loss": 38.0279, + "step": 4659 + }, + { + "epoch": 16.823476297968398, + "grad_norm": 261.5404052734375, + "learning_rate": 4.682395644283122e-06, + "loss": 36.5149, + "step": 4660 + }, + { + "epoch": 16.823476297968398, + "eval_loss": 0.6019832491874695, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4660 + }, + { + "epoch": 16.82708803611738, + "grad_norm": 313.2170104980469, + "learning_rate": 4.676950998185118e-06, + "loss": 35.6121, + "step": 4661 + }, + { + "epoch": 16.830699774266364, + "grad_norm": 297.2791442871094, + "learning_rate": 4.671506352087115e-06, + "loss": 31.1869, + "step": 4662 + }, + { + "epoch": 16.83431151241535, + "grad_norm": 269.7320556640625, + "learning_rate": 4.666061705989111e-06, + "loss": 31.8674, + "step": 4663 + }, + { + "epoch": 16.837923250564334, + "grad_norm": 245.3898468017578, + "learning_rate": 4.660617059891107e-06, + "loss": 30.3726, + "step": 4664 + }, + { + "epoch": 16.841534988713317, + "grad_norm": 244.63223266601562, + "learning_rate": 4.655172413793104e-06, + "loss": 32.6154, + "step": 4665 + }, + { + "epoch": 16.845146726862303, + "grad_norm": 263.6791076660156, + "learning_rate": 4.6497277676951e-06, + "loss": 33.0104, + "step": 4666 + }, + { + "epoch": 16.848758465011286, + "grad_norm": 398.6610107421875, + "learning_rate": 4.644283121597096e-06, + "loss": 32.5445, + "step": 4667 + }, + { + "epoch": 16.85237020316027, + "grad_norm": 312.8116149902344, + "learning_rate": 4.6388384754990924e-06, + "loss": 32.5698, + "step": 4668 + }, + { + "epoch": 16.855981941309256, + "grad_norm": 296.6167297363281, + "learning_rate": 4.6333938294010895e-06, + "loss": 33.1377, + "step": 4669 + }, + { + "epoch": 16.85959367945824, + "grad_norm": 285.299560546875, + "learning_rate": 4.627949183303086e-06, + "loss": 33.3279, + "step": 4670 + }, + { + "epoch": 16.85959367945824, + "eval_loss": 0.6027817726135254, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.985, + "eval_steps_per_second": 56.985, + "step": 4670 + }, + { + "epoch": 16.863205417607222, + "grad_norm": 285.2948913574219, + "learning_rate": 4.622504537205081e-06, + "loss": 35.6879, + "step": 4671 + }, + { + "epoch": 16.86681715575621, + "grad_norm": 280.6530456542969, + "learning_rate": 4.617059891107078e-06, + "loss": 32.3154, + "step": 4672 + }, + { + "epoch": 16.870428893905192, + "grad_norm": 314.206787109375, + "learning_rate": 4.6116152450090744e-06, + "loss": 34.3517, + "step": 4673 + }, + { + "epoch": 16.874040632054175, + "grad_norm": 305.9198913574219, + "learning_rate": 4.6061705989110715e-06, + "loss": 34.1571, + "step": 4674 + }, + { + "epoch": 16.877652370203162, + "grad_norm": 287.0543212890625, + "learning_rate": 4.600725952813067e-06, + "loss": 35.1647, + "step": 4675 + }, + { + "epoch": 16.881264108352145, + "grad_norm": 286.912109375, + "learning_rate": 4.595281306715064e-06, + "loss": 34.8698, + "step": 4676 + }, + { + "epoch": 16.884875846501128, + "grad_norm": 322.4527587890625, + "learning_rate": 4.58983666061706e-06, + "loss": 36.3449, + "step": 4677 + }, + { + "epoch": 16.888487584650115, + "grad_norm": 239.41659545898438, + "learning_rate": 4.584392014519056e-06, + "loss": 25.3085, + "step": 4678 + }, + { + "epoch": 16.892099322799098, + "grad_norm": 215.5685577392578, + "learning_rate": 4.578947368421053e-06, + "loss": 22.3485, + "step": 4679 + }, + { + "epoch": 16.89571106094808, + "grad_norm": 291.2452697753906, + "learning_rate": 4.573502722323049e-06, + "loss": 22.3257, + "step": 4680 + }, + { + "epoch": 16.89571106094808, + "eval_loss": 0.6040940284729004, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 4680 + }, + { + "epoch": 16.899322799097064, + "grad_norm": 291.39935302734375, + "learning_rate": 4.568058076225046e-06, + "loss": 23.268, + "step": 4681 + }, + { + "epoch": 16.90293453724605, + "grad_norm": 272.211181640625, + "learning_rate": 4.562613430127041e-06, + "loss": 23.7127, + "step": 4682 + }, + { + "epoch": 16.906546275395034, + "grad_norm": 220.84397888183594, + "learning_rate": 4.5571687840290384e-06, + "loss": 39.2488, + "step": 4683 + }, + { + "epoch": 16.910158013544017, + "grad_norm": 238.49859619140625, + "learning_rate": 4.551724137931035e-06, + "loss": 39.5643, + "step": 4684 + }, + { + "epoch": 16.913769751693003, + "grad_norm": 325.3870544433594, + "learning_rate": 4.546279491833032e-06, + "loss": 38.6149, + "step": 4685 + }, + { + "epoch": 16.917381489841986, + "grad_norm": 307.02349853515625, + "learning_rate": 4.540834845735027e-06, + "loss": 38.0317, + "step": 4686 + }, + { + "epoch": 16.92099322799097, + "grad_norm": 433.99359130859375, + "learning_rate": 4.535390199637023e-06, + "loss": 40.4567, + "step": 4687 + }, + { + "epoch": 16.924604966139956, + "grad_norm": 327.97015380859375, + "learning_rate": 4.5299455535390204e-06, + "loss": 40.3109, + "step": 4688 + }, + { + "epoch": 16.92821670428894, + "grad_norm": 257.20684814453125, + "learning_rate": 4.524500907441017e-06, + "loss": 36.2826, + "step": 4689 + }, + { + "epoch": 16.931828442437922, + "grad_norm": 402.6732177734375, + "learning_rate": 4.519056261343013e-06, + "loss": 36.9163, + "step": 4690 + }, + { + "epoch": 16.931828442437922, + "eval_loss": 0.6016727089881897, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4690 + }, + { + "epoch": 16.93544018058691, + "grad_norm": 380.8903503417969, + "learning_rate": 4.513611615245009e-06, + "loss": 36.7101, + "step": 4691 + }, + { + "epoch": 16.939051918735892, + "grad_norm": 365.4950256347656, + "learning_rate": 4.508166969147006e-06, + "loss": 37.9853, + "step": 4692 + }, + { + "epoch": 16.942663656884875, + "grad_norm": 302.3895568847656, + "learning_rate": 4.5027223230490016e-06, + "loss": 38.109, + "step": 4693 + }, + { + "epoch": 16.94627539503386, + "grad_norm": 333.5274963378906, + "learning_rate": 4.497277676950998e-06, + "loss": 37.5992, + "step": 4694 + }, + { + "epoch": 16.949887133182845, + "grad_norm": 364.3126525878906, + "learning_rate": 4.491833030852995e-06, + "loss": 38.0139, + "step": 4695 + }, + { + "epoch": 16.953498871331828, + "grad_norm": 509.94671630859375, + "learning_rate": 4.486388384754991e-06, + "loss": 39.8027, + "step": 4696 + }, + { + "epoch": 16.957110609480814, + "grad_norm": 507.8591613769531, + "learning_rate": 4.480943738656987e-06, + "loss": 40.0044, + "step": 4697 + }, + { + "epoch": 16.960722347629797, + "grad_norm": 324.5463562011719, + "learning_rate": 4.4754990925589836e-06, + "loss": 34.9058, + "step": 4698 + }, + { + "epoch": 16.96433408577878, + "grad_norm": 318.39801025390625, + "learning_rate": 4.470054446460981e-06, + "loss": 33.1318, + "step": 4699 + }, + { + "epoch": 16.967945823927764, + "grad_norm": 391.8466796875, + "learning_rate": 4.464609800362977e-06, + "loss": 32.2083, + "step": 4700 + }, + { + "epoch": 16.967945823927764, + "eval_loss": 0.6047930717468262, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4700 + }, + { + "epoch": 16.97155756207675, + "grad_norm": 530.4073486328125, + "learning_rate": 4.459165154264972e-06, + "loss": 31.9882, + "step": 4701 + }, + { + "epoch": 16.975169300225733, + "grad_norm": 590.9242553710938, + "learning_rate": 4.453720508166969e-06, + "loss": 34.1937, + "step": 4702 + }, + { + "epoch": 16.978781038374716, + "grad_norm": 377.5596618652344, + "learning_rate": 4.4482758620689656e-06, + "loss": 34.6501, + "step": 4703 + }, + { + "epoch": 16.982392776523703, + "grad_norm": 431.2909240722656, + "learning_rate": 4.442831215970962e-06, + "loss": 33.9402, + "step": 4704 + }, + { + "epoch": 16.986004514672686, + "grad_norm": 294.7673645019531, + "learning_rate": 4.437386569872958e-06, + "loss": 33.7873, + "step": 4705 + }, + { + "epoch": 16.98961625282167, + "grad_norm": 346.1203918457031, + "learning_rate": 4.431941923774955e-06, + "loss": 35.2935, + "step": 4706 + }, + { + "epoch": 16.993227990970656, + "grad_norm": 257.8351745605469, + "learning_rate": 4.426497277676951e-06, + "loss": 28.3513, + "step": 4707 + }, + { + "epoch": 16.99683972911964, + "grad_norm": 168.35118103027344, + "learning_rate": 4.421052631578947e-06, + "loss": 22.3009, + "step": 4708 + }, + { + "epoch": 17.0, + "grad_norm": 210.20738220214844, + "learning_rate": 4.415607985480944e-06, + "loss": 20.1848, + "step": 4709 + }, + { + "epoch": 17.003611738148983, + "grad_norm": 234.40866088867188, + "learning_rate": 4.41016333938294e-06, + "loss": 38.0969, + "step": 4710 + }, + { + "epoch": 17.003611738148983, + "eval_loss": 0.6026900410652161, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4710 + }, + { + "epoch": 17.00722347629797, + "grad_norm": 242.27195739746094, + "learning_rate": 4.404718693284937e-06, + "loss": 38.8902, + "step": 4711 + }, + { + "epoch": 17.010835214446953, + "grad_norm": 215.1695556640625, + "learning_rate": 4.3992740471869325e-06, + "loss": 38.5509, + "step": 4712 + }, + { + "epoch": 17.014446952595936, + "grad_norm": 390.2027587890625, + "learning_rate": 4.3938294010889296e-06, + "loss": 38.5247, + "step": 4713 + }, + { + "epoch": 17.018058690744923, + "grad_norm": 397.77484130859375, + "learning_rate": 4.388384754990926e-06, + "loss": 39.1981, + "step": 4714 + }, + { + "epoch": 17.021670428893906, + "grad_norm": 298.10089111328125, + "learning_rate": 4.382940108892923e-06, + "loss": 38.2627, + "step": 4715 + }, + { + "epoch": 17.02528216704289, + "grad_norm": 291.7283935546875, + "learning_rate": 4.377495462794918e-06, + "loss": 38.8027, + "step": 4716 + }, + { + "epoch": 17.028893905191875, + "grad_norm": 254.8542938232422, + "learning_rate": 4.3720508166969145e-06, + "loss": 38.6095, + "step": 4717 + }, + { + "epoch": 17.03250564334086, + "grad_norm": 244.336181640625, + "learning_rate": 4.3666061705989116e-06, + "loss": 38.2955, + "step": 4718 + }, + { + "epoch": 17.03611738148984, + "grad_norm": 376.92523193359375, + "learning_rate": 4.361161524500907e-06, + "loss": 38.5203, + "step": 4719 + }, + { + "epoch": 17.039729119638825, + "grad_norm": 339.6172790527344, + "learning_rate": 4.355716878402904e-06, + "loss": 37.4332, + "step": 4720 + }, + { + "epoch": 17.039729119638825, + "eval_loss": 0.6024167537689209, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4720 + }, + { + "epoch": 17.04334085778781, + "grad_norm": 433.0855712890625, + "learning_rate": 4.3502722323049e-06, + "loss": 36.4444, + "step": 4721 + }, + { + "epoch": 17.046952595936794, + "grad_norm": 224.3468475341797, + "learning_rate": 4.344827586206897e-06, + "loss": 35.7802, + "step": 4722 + }, + { + "epoch": 17.050564334085777, + "grad_norm": 385.5466003417969, + "learning_rate": 4.339382940108893e-06, + "loss": 35.4641, + "step": 4723 + }, + { + "epoch": 17.054176072234764, + "grad_norm": 311.80596923828125, + "learning_rate": 4.333938294010889e-06, + "loss": 36.4231, + "step": 4724 + }, + { + "epoch": 17.057787810383747, + "grad_norm": 283.189453125, + "learning_rate": 4.328493647912886e-06, + "loss": 37.5405, + "step": 4725 + }, + { + "epoch": 17.06139954853273, + "grad_norm": 403.85833740234375, + "learning_rate": 4.323049001814882e-06, + "loss": 37.4723, + "step": 4726 + }, + { + "epoch": 17.065011286681717, + "grad_norm": 390.03515625, + "learning_rate": 4.3176043557168785e-06, + "loss": 36.6799, + "step": 4727 + }, + { + "epoch": 17.0686230248307, + "grad_norm": 318.63427734375, + "learning_rate": 4.312159709618875e-06, + "loss": 36.6312, + "step": 4728 + }, + { + "epoch": 17.072234762979683, + "grad_norm": 318.43402099609375, + "learning_rate": 4.306715063520872e-06, + "loss": 37.9104, + "step": 4729 + }, + { + "epoch": 17.07584650112867, + "grad_norm": 320.9336853027344, + "learning_rate": 4.301270417422867e-06, + "loss": 36.7254, + "step": 4730 + }, + { + "epoch": 17.07584650112867, + "eval_loss": 0.6046721339225769, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4730 + }, + { + "epoch": 17.079458239277653, + "grad_norm": 345.9001770019531, + "learning_rate": 4.295825771324863e-06, + "loss": 36.0298, + "step": 4731 + }, + { + "epoch": 17.083069977426636, + "grad_norm": 397.10369873046875, + "learning_rate": 4.2903811252268605e-06, + "loss": 37.9418, + "step": 4732 + }, + { + "epoch": 17.086681715575622, + "grad_norm": 293.1039123535156, + "learning_rate": 4.284936479128857e-06, + "loss": 37.2627, + "step": 4733 + }, + { + "epoch": 17.090293453724605, + "grad_norm": 412.5190734863281, + "learning_rate": 4.279491833030853e-06, + "loss": 38.3429, + "step": 4734 + }, + { + "epoch": 17.09390519187359, + "grad_norm": 241.35105895996094, + "learning_rate": 4.274047186932849e-06, + "loss": 38.559, + "step": 4735 + }, + { + "epoch": 17.097516930022575, + "grad_norm": 275.169189453125, + "learning_rate": 4.268602540834846e-06, + "loss": 36.8167, + "step": 4736 + }, + { + "epoch": 17.101128668171558, + "grad_norm": 272.3182678222656, + "learning_rate": 4.2631578947368425e-06, + "loss": 37.0246, + "step": 4737 + }, + { + "epoch": 17.10474040632054, + "grad_norm": 215.6425018310547, + "learning_rate": 4.257713248638839e-06, + "loss": 33.1282, + "step": 4738 + }, + { + "epoch": 17.108352144469524, + "grad_norm": 276.6223449707031, + "learning_rate": 4.252268602540835e-06, + "loss": 33.2698, + "step": 4739 + }, + { + "epoch": 17.11196388261851, + "grad_norm": 311.1632385253906, + "learning_rate": 4.246823956442831e-06, + "loss": 31.0105, + "step": 4740 + }, + { + "epoch": 17.11196388261851, + "eval_loss": 0.6019421815872192, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.089, + "eval_steps_per_second": 57.089, + "step": 4740 + }, + { + "epoch": 17.115575620767494, + "grad_norm": 254.7543487548828, + "learning_rate": 4.241379310344828e-06, + "loss": 31.4721, + "step": 4741 + }, + { + "epoch": 17.119187358916477, + "grad_norm": 239.24957275390625, + "learning_rate": 4.235934664246824e-06, + "loss": 31.0346, + "step": 4742 + }, + { + "epoch": 17.122799097065464, + "grad_norm": 262.0681457519531, + "learning_rate": 4.230490018148821e-06, + "loss": 32.0604, + "step": 4743 + }, + { + "epoch": 17.126410835214447, + "grad_norm": 218.3557586669922, + "learning_rate": 4.225045372050817e-06, + "loss": 32.2036, + "step": 4744 + }, + { + "epoch": 17.13002257336343, + "grad_norm": 277.5924072265625, + "learning_rate": 4.219600725952813e-06, + "loss": 32.1412, + "step": 4745 + }, + { + "epoch": 17.133634311512417, + "grad_norm": 226.93211364746094, + "learning_rate": 4.214156079854809e-06, + "loss": 34.3367, + "step": 4746 + }, + { + "epoch": 17.1372460496614, + "grad_norm": 303.2422180175781, + "learning_rate": 4.208711433756806e-06, + "loss": 33.2001, + "step": 4747 + }, + { + "epoch": 17.140857787810383, + "grad_norm": 257.6164245605469, + "learning_rate": 4.203266787658803e-06, + "loss": 34.155, + "step": 4748 + }, + { + "epoch": 17.14446952595937, + "grad_norm": 361.1567077636719, + "learning_rate": 4.197822141560798e-06, + "loss": 35.236, + "step": 4749 + }, + { + "epoch": 17.148081264108352, + "grad_norm": 292.0034484863281, + "learning_rate": 4.192377495462795e-06, + "loss": 34.304, + "step": 4750 + }, + { + "epoch": 17.148081264108352, + "eval_loss": 0.6034401059150696, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4750 + }, + { + "epoch": 17.151693002257336, + "grad_norm": 327.8070983886719, + "learning_rate": 4.186932849364791e-06, + "loss": 33.7346, + "step": 4751 + }, + { + "epoch": 17.155304740406322, + "grad_norm": 312.9547119140625, + "learning_rate": 4.1814882032667885e-06, + "loss": 35.9274, + "step": 4752 + }, + { + "epoch": 17.158916478555305, + "grad_norm": 305.19500732421875, + "learning_rate": 4.176043557168784e-06, + "loss": 35.5567, + "step": 4753 + }, + { + "epoch": 17.16252821670429, + "grad_norm": 339.37152099609375, + "learning_rate": 4.17059891107078e-06, + "loss": 35.8013, + "step": 4754 + }, + { + "epoch": 17.16613995485327, + "grad_norm": 247.36679077148438, + "learning_rate": 4.165154264972777e-06, + "loss": 29.2211, + "step": 4755 + }, + { + "epoch": 17.169751693002258, + "grad_norm": 255.65269470214844, + "learning_rate": 4.1597096188747725e-06, + "loss": 21.6191, + "step": 4756 + }, + { + "epoch": 17.17336343115124, + "grad_norm": 239.66448974609375, + "learning_rate": 4.15426497277677e-06, + "loss": 22.0521, + "step": 4757 + }, + { + "epoch": 17.176975169300224, + "grad_norm": 212.25955200195312, + "learning_rate": 4.148820326678766e-06, + "loss": 22.6641, + "step": 4758 + }, + { + "epoch": 17.18058690744921, + "grad_norm": 229.9394073486328, + "learning_rate": 4.143375680580763e-06, + "loss": 22.8787, + "step": 4759 + }, + { + "epoch": 17.184198645598194, + "grad_norm": 237.46343994140625, + "learning_rate": 4.137931034482758e-06, + "loss": 39.1222, + "step": 4760 + }, + { + "epoch": 17.184198645598194, + "eval_loss": 0.6031526327133179, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4760 + }, + { + "epoch": 17.187810383747177, + "grad_norm": 229.23849487304688, + "learning_rate": 4.132486388384755e-06, + "loss": 39.7664, + "step": 4761 + }, + { + "epoch": 17.191422121896164, + "grad_norm": 250.67529296875, + "learning_rate": 4.127041742286752e-06, + "loss": 38.6754, + "step": 4762 + }, + { + "epoch": 17.195033860045147, + "grad_norm": 272.9320068359375, + "learning_rate": 4.121597096188748e-06, + "loss": 39.1262, + "step": 4763 + }, + { + "epoch": 17.19864559819413, + "grad_norm": 267.82427978515625, + "learning_rate": 4.116152450090744e-06, + "loss": 38.2223, + "step": 4764 + }, + { + "epoch": 17.202257336343116, + "grad_norm": 266.35760498046875, + "learning_rate": 4.11070780399274e-06, + "loss": 39.2069, + "step": 4765 + }, + { + "epoch": 17.2058690744921, + "grad_norm": 221.62606811523438, + "learning_rate": 4.105263157894737e-06, + "loss": 38.8956, + "step": 4766 + }, + { + "epoch": 17.209480812641083, + "grad_norm": 243.73110961914062, + "learning_rate": 4.099818511796734e-06, + "loss": 41.5868, + "step": 4767 + }, + { + "epoch": 17.21309255079007, + "grad_norm": 268.6092224121094, + "learning_rate": 4.09437386569873e-06, + "loss": 39.1041, + "step": 4768 + }, + { + "epoch": 17.216704288939052, + "grad_norm": 300.3140563964844, + "learning_rate": 4.088929219600726e-06, + "loss": 38.25, + "step": 4769 + }, + { + "epoch": 17.220316027088035, + "grad_norm": 264.56805419921875, + "learning_rate": 4.083484573502722e-06, + "loss": 38.186, + "step": 4770 + }, + { + "epoch": 17.220316027088035, + "eval_loss": 0.6044566631317139, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4770 + }, + { + "epoch": 17.223927765237022, + "grad_norm": 303.47686767578125, + "learning_rate": 4.0780399274047185e-06, + "loss": 37.7011, + "step": 4771 + }, + { + "epoch": 17.227539503386005, + "grad_norm": 238.3590545654297, + "learning_rate": 4.072595281306715e-06, + "loss": 34.6695, + "step": 4772 + }, + { + "epoch": 17.231151241534988, + "grad_norm": 252.90081787109375, + "learning_rate": 4.067150635208712e-06, + "loss": 36.1903, + "step": 4773 + }, + { + "epoch": 17.23476297968397, + "grad_norm": 286.5584716796875, + "learning_rate": 4.061705989110708e-06, + "loss": 36.4185, + "step": 4774 + }, + { + "epoch": 17.238374717832958, + "grad_norm": 322.25323486328125, + "learning_rate": 4.056261343012704e-06, + "loss": 36.0098, + "step": 4775 + }, + { + "epoch": 17.24198645598194, + "grad_norm": 292.09405517578125, + "learning_rate": 4.0508166969147005e-06, + "loss": 35.4347, + "step": 4776 + }, + { + "epoch": 17.245598194130924, + "grad_norm": 295.9725341796875, + "learning_rate": 4.045372050816697e-06, + "loss": 37.3512, + "step": 4777 + }, + { + "epoch": 17.24920993227991, + "grad_norm": 326.34539794921875, + "learning_rate": 4.039927404718694e-06, + "loss": 38.6739, + "step": 4778 + }, + { + "epoch": 17.252821670428894, + "grad_norm": 384.3682861328125, + "learning_rate": 4.034482758620689e-06, + "loss": 38.0995, + "step": 4779 + }, + { + "epoch": 17.256433408577877, + "grad_norm": 400.59136962890625, + "learning_rate": 4.029038112522686e-06, + "loss": 36.7733, + "step": 4780 + }, + { + "epoch": 17.256433408577877, + "eval_loss": 0.6064656972885132, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4780 + }, + { + "epoch": 17.260045146726863, + "grad_norm": 379.5261535644531, + "learning_rate": 4.0235934664246825e-06, + "loss": 36.1385, + "step": 4781 + }, + { + "epoch": 17.263656884875846, + "grad_norm": 277.1004638671875, + "learning_rate": 4.018148820326679e-06, + "loss": 39.1495, + "step": 4782 + }, + { + "epoch": 17.26726862302483, + "grad_norm": 274.6176452636719, + "learning_rate": 4.012704174228675e-06, + "loss": 37.8503, + "step": 4783 + }, + { + "epoch": 17.270880361173816, + "grad_norm": 338.9375305175781, + "learning_rate": 4.007259528130671e-06, + "loss": 39.7149, + "step": 4784 + }, + { + "epoch": 17.2744920993228, + "grad_norm": 299.60662841796875, + "learning_rate": 4.001814882032668e-06, + "loss": 37.6013, + "step": 4785 + }, + { + "epoch": 17.278103837471782, + "grad_norm": 278.9190368652344, + "learning_rate": 3.996370235934664e-06, + "loss": 38.1106, + "step": 4786 + }, + { + "epoch": 17.28171557562077, + "grad_norm": 254.48443603515625, + "learning_rate": 3.990925589836661e-06, + "loss": 35.9676, + "step": 4787 + }, + { + "epoch": 17.285327313769752, + "grad_norm": 274.65338134765625, + "learning_rate": 3.985480943738657e-06, + "loss": 35.3535, + "step": 4788 + }, + { + "epoch": 17.288939051918735, + "grad_norm": 288.748779296875, + "learning_rate": 3.980036297640654e-06, + "loss": 32.7356, + "step": 4789 + }, + { + "epoch": 17.292550790067722, + "grad_norm": 229.0682830810547, + "learning_rate": 3.9745916515426495e-06, + "loss": 31.2048, + "step": 4790 + }, + { + "epoch": 17.292550790067722, + "eval_loss": 0.6020387411117554, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 4790 + }, + { + "epoch": 17.296162528216705, + "grad_norm": 234.29937744140625, + "learning_rate": 3.9691470054446465e-06, + "loss": 31.7953, + "step": 4791 + }, + { + "epoch": 17.299774266365688, + "grad_norm": 236.3527069091797, + "learning_rate": 3.963702359346643e-06, + "loss": 31.6686, + "step": 4792 + }, + { + "epoch": 17.30338600451467, + "grad_norm": 253.44126892089844, + "learning_rate": 3.958257713248639e-06, + "loss": 31.8848, + "step": 4793 + }, + { + "epoch": 17.306997742663658, + "grad_norm": 270.66046142578125, + "learning_rate": 3.952813067150635e-06, + "loss": 32.1593, + "step": 4794 + }, + { + "epoch": 17.31060948081264, + "grad_norm": 242.77777099609375, + "learning_rate": 3.9473684210526315e-06, + "loss": 32.4555, + "step": 4795 + }, + { + "epoch": 17.314221218961624, + "grad_norm": 243.9296112060547, + "learning_rate": 3.9419237749546285e-06, + "loss": 34.0444, + "step": 4796 + }, + { + "epoch": 17.31783295711061, + "grad_norm": 276.2138671875, + "learning_rate": 3.936479128856624e-06, + "loss": 32.0404, + "step": 4797 + }, + { + "epoch": 17.321444695259594, + "grad_norm": 262.97802734375, + "learning_rate": 3.931034482758621e-06, + "loss": 32.4535, + "step": 4798 + }, + { + "epoch": 17.325056433408577, + "grad_norm": 338.9852600097656, + "learning_rate": 3.925589836660617e-06, + "loss": 34.6855, + "step": 4799 + }, + { + "epoch": 17.328668171557563, + "grad_norm": 270.85650634765625, + "learning_rate": 3.9201451905626135e-06, + "loss": 32.2425, + "step": 4800 + }, + { + "epoch": 17.328668171557563, + "eval_loss": 0.603055477142334, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4800 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.354097956532388e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-4800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52493f66a0a959f110775853f1437dd34becff2a --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a42666f178e230de0e55b7f6cd78f24709dc819cc8e22276b9b7dbb818c40abc +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..a469d692aed6ae313198183c161d3a78aaab0ed3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1db4080238ccc9a1f4299f6bf218ea8996da967a5144026156ec3033cdf068f +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..99be0e82082edd834ba85af673721c6e94efcc03 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:144484bfe77d7345b51b476ce162e7d30e2af44e4cc09cb2605b41275fc9fff8 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0778ccd2e2316a664ca03f17ee5fb0c432737574 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acdfe7dc8f0816c3642f717ff6ca80591142414dfab5c9999d5272956cb94a82 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d5395b8e3bfadc63ba41671b193ed8619d6e9d6b --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1331f18b6fe4c16577c3a9e1711b4957ff6558f8a377c19030d2c08b29df7f86 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f5498256a877f0deffddee0ab0a7f398de7d6153 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/trainer_state.json @@ -0,0 +1,39033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.050564334085777, + "eval_steps": 10, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + }, + { + "epoch": 15.166139954853273, + "grad_norm": 227.31346130371094, + "learning_rate": 7.181488203266788e-06, + "loss": 28.0234, + "step": 4201 + }, + { + "epoch": 15.169751693002258, + "grad_norm": 208.75048828125, + "learning_rate": 7.176043557168784e-06, + "loss": 22.5147, + "step": 4202 + }, + { + "epoch": 15.173363431151241, + "grad_norm": 222.91090393066406, + "learning_rate": 7.17059891107078e-06, + "loss": 22.1029, + "step": 4203 + }, + { + "epoch": 15.176975169300226, + "grad_norm": 219.40621948242188, + "learning_rate": 7.165154264972777e-06, + "loss": 22.9827, + "step": 4204 + }, + { + "epoch": 15.18058690744921, + "grad_norm": 229.11813354492188, + "learning_rate": 7.1597096188747735e-06, + "loss": 23.6974, + "step": 4205 + }, + { + "epoch": 15.184198645598194, + "grad_norm": 256.7950744628906, + "learning_rate": 7.15426497277677e-06, + "loss": 39.6585, + "step": 4206 + }, + { + "epoch": 15.187810383747179, + "grad_norm": 237.47613525390625, + "learning_rate": 7.148820326678766e-06, + "loss": 40.0478, + "step": 4207 + }, + { + "epoch": 15.191422121896162, + "grad_norm": 259.54296875, + "learning_rate": 7.143375680580762e-06, + "loss": 39.7604, + "step": 4208 + }, + { + "epoch": 15.195033860045147, + "grad_norm": 249.7389678955078, + "learning_rate": 7.137931034482759e-06, + "loss": 39.0201, + "step": 4209 + }, + { + "epoch": 15.198645598194132, + "grad_norm": 298.4624938964844, + "learning_rate": 7.132486388384755e-06, + "loss": 39.8575, + "step": 4210 + }, + { + "epoch": 15.198645598194132, + "eval_loss": 0.6088115572929382, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 4210 + }, + { + "epoch": 15.202257336343115, + "grad_norm": 267.57659912109375, + "learning_rate": 7.127041742286752e-06, + "loss": 38.8929, + "step": 4211 + }, + { + "epoch": 15.2058690744921, + "grad_norm": 243.88333129882812, + "learning_rate": 7.121597096188748e-06, + "loss": 39.6078, + "step": 4212 + }, + { + "epoch": 15.209480812641084, + "grad_norm": 268.2644348144531, + "learning_rate": 7.116152450090745e-06, + "loss": 39.9488, + "step": 4213 + }, + { + "epoch": 15.213092550790067, + "grad_norm": 240.2657928466797, + "learning_rate": 7.11070780399274e-06, + "loss": 40.1645, + "step": 4214 + }, + { + "epoch": 15.216704288939052, + "grad_norm": 198.76910400390625, + "learning_rate": 7.105263157894737e-06, + "loss": 38.2229, + "step": 4215 + }, + { + "epoch": 15.220316027088035, + "grad_norm": 234.11170959472656, + "learning_rate": 7.099818511796734e-06, + "loss": 39.5294, + "step": 4216 + }, + { + "epoch": 15.22392776523702, + "grad_norm": 192.80194091796875, + "learning_rate": 7.094373865698729e-06, + "loss": 36.9752, + "step": 4217 + }, + { + "epoch": 15.227539503386005, + "grad_norm": 241.8236846923828, + "learning_rate": 7.088929219600726e-06, + "loss": 36.1043, + "step": 4218 + }, + { + "epoch": 15.231151241534988, + "grad_norm": 451.6199645996094, + "learning_rate": 7.083484573502722e-06, + "loss": 37.7911, + "step": 4219 + }, + { + "epoch": 15.234762979683973, + "grad_norm": 351.9429626464844, + "learning_rate": 7.0780399274047195e-06, + "loss": 35.5202, + "step": 4220 + }, + { + "epoch": 15.234762979683973, + "eval_loss": 0.6093130111694336, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 4220 + }, + { + "epoch": 15.238374717832958, + "grad_norm": 266.4995422363281, + "learning_rate": 7.072595281306715e-06, + "loss": 37.5552, + "step": 4221 + }, + { + "epoch": 15.241986455981941, + "grad_norm": 258.74578857421875, + "learning_rate": 7.067150635208712e-06, + "loss": 37.1315, + "step": 4222 + }, + { + "epoch": 15.245598194130926, + "grad_norm": 233.30921936035156, + "learning_rate": 7.061705989110708e-06, + "loss": 36.9237, + "step": 4223 + }, + { + "epoch": 15.249209932279909, + "grad_norm": 235.8688201904297, + "learning_rate": 7.056261343012704e-06, + "loss": 38.0112, + "step": 4224 + }, + { + "epoch": 15.252821670428894, + "grad_norm": 214.88436889648438, + "learning_rate": 7.050816696914701e-06, + "loss": 38.5641, + "step": 4225 + }, + { + "epoch": 15.256433408577879, + "grad_norm": 252.64144897460938, + "learning_rate": 7.045372050816697e-06, + "loss": 36.7125, + "step": 4226 + }, + { + "epoch": 15.260045146726862, + "grad_norm": 293.78424072265625, + "learning_rate": 7.039927404718694e-06, + "loss": 37.5956, + "step": 4227 + }, + { + "epoch": 15.263656884875846, + "grad_norm": 234.13510131835938, + "learning_rate": 7.03448275862069e-06, + "loss": 38.1829, + "step": 4228 + }, + { + "epoch": 15.267268623024831, + "grad_norm": 279.534912109375, + "learning_rate": 7.029038112522686e-06, + "loss": 39.0785, + "step": 4229 + }, + { + "epoch": 15.270880361173814, + "grad_norm": 246.4442596435547, + "learning_rate": 7.023593466424683e-06, + "loss": 39.1753, + "step": 4230 + }, + { + "epoch": 15.270880361173814, + "eval_loss": 0.6043311357498169, + "eval_runtime": 3.1452, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 4230 + }, + { + "epoch": 15.2744920993228, + "grad_norm": 233.87466430664062, + "learning_rate": 7.018148820326679e-06, + "loss": 39.8464, + "step": 4231 + }, + { + "epoch": 15.278103837471784, + "grad_norm": 228.54898071289062, + "learning_rate": 7.012704174228675e-06, + "loss": 37.9721, + "step": 4232 + }, + { + "epoch": 15.281715575620767, + "grad_norm": 273.70050048828125, + "learning_rate": 7.007259528130671e-06, + "loss": 38.9153, + "step": 4233 + }, + { + "epoch": 15.285327313769752, + "grad_norm": 269.8402404785156, + "learning_rate": 7.001814882032668e-06, + "loss": 36.7607, + "step": 4234 + }, + { + "epoch": 15.288939051918735, + "grad_norm": 260.13629150390625, + "learning_rate": 6.996370235934665e-06, + "loss": 35.3684, + "step": 4235 + }, + { + "epoch": 15.29255079006772, + "grad_norm": 223.9878692626953, + "learning_rate": 6.990925589836661e-06, + "loss": 32.8784, + "step": 4236 + }, + { + "epoch": 15.296162528216705, + "grad_norm": 225.69212341308594, + "learning_rate": 6.985480943738657e-06, + "loss": 31.3751, + "step": 4237 + }, + { + "epoch": 15.299774266365688, + "grad_norm": 215.99801635742188, + "learning_rate": 6.980036297640653e-06, + "loss": 31.5331, + "step": 4238 + }, + { + "epoch": 15.303386004514673, + "grad_norm": 263.26568603515625, + "learning_rate": 6.97459165154265e-06, + "loss": 32.5806, + "step": 4239 + }, + { + "epoch": 15.306997742663658, + "grad_norm": 203.2392578125, + "learning_rate": 6.969147005444646e-06, + "loss": 31.6379, + "step": 4240 + }, + { + "epoch": 15.306997742663658, + "eval_loss": 0.6046441793441772, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 4240 + }, + { + "epoch": 15.31060948081264, + "grad_norm": 221.2167510986328, + "learning_rate": 6.963702359346643e-06, + "loss": 33.7034, + "step": 4241 + }, + { + "epoch": 15.314221218961626, + "grad_norm": 212.58737182617188, + "learning_rate": 6.958257713248639e-06, + "loss": 32.5511, + "step": 4242 + }, + { + "epoch": 15.317832957110609, + "grad_norm": 270.7123718261719, + "learning_rate": 6.952813067150635e-06, + "loss": 33.2513, + "step": 4243 + }, + { + "epoch": 15.321444695259594, + "grad_norm": 270.2066345214844, + "learning_rate": 6.9473684210526315e-06, + "loss": 33.9559, + "step": 4244 + }, + { + "epoch": 15.325056433408578, + "grad_norm": 232.8043212890625, + "learning_rate": 6.941923774954628e-06, + "loss": 33.9916, + "step": 4245 + }, + { + "epoch": 15.328668171557561, + "grad_norm": 325.419921875, + "learning_rate": 6.936479128856625e-06, + "loss": 35.2098, + "step": 4246 + }, + { + "epoch": 15.332279909706546, + "grad_norm": 303.326416015625, + "learning_rate": 6.93103448275862e-06, + "loss": 35.0784, + "step": 4247 + }, + { + "epoch": 15.335891647855531, + "grad_norm": 327.05963134765625, + "learning_rate": 6.925589836660617e-06, + "loss": 35.9915, + "step": 4248 + }, + { + "epoch": 15.339503386004514, + "grad_norm": 326.58795166015625, + "learning_rate": 6.9201451905626135e-06, + "loss": 35.1914, + "step": 4249 + }, + { + "epoch": 15.343115124153499, + "grad_norm": 406.38812255859375, + "learning_rate": 6.914700544464611e-06, + "loss": 37.1535, + "step": 4250 + }, + { + "epoch": 15.343115124153499, + "eval_loss": 0.6056071519851685, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 4250 + }, + { + "epoch": 15.346726862302482, + "grad_norm": 325.6965637207031, + "learning_rate": 6.909255898366606e-06, + "loss": 29.8698, + "step": 4251 + }, + { + "epoch": 15.350338600451467, + "grad_norm": 212.59727478027344, + "learning_rate": 6.903811252268603e-06, + "loss": 22.2995, + "step": 4252 + }, + { + "epoch": 15.353950338600452, + "grad_norm": 257.447509765625, + "learning_rate": 6.898366606170599e-06, + "loss": 23.1014, + "step": 4253 + }, + { + "epoch": 15.357562076749435, + "grad_norm": 266.139892578125, + "learning_rate": 6.8929219600725955e-06, + "loss": 23.2319, + "step": 4254 + }, + { + "epoch": 15.36117381489842, + "grad_norm": 332.7207336425781, + "learning_rate": 6.887477313974592e-06, + "loss": 23.7218, + "step": 4255 + }, + { + "epoch": 15.364785553047405, + "grad_norm": 272.7341003417969, + "learning_rate": 6.882032667876588e-06, + "loss": 39.5787, + "step": 4256 + }, + { + "epoch": 15.368397291196388, + "grad_norm": 259.00872802734375, + "learning_rate": 6.876588021778585e-06, + "loss": 41.0874, + "step": 4257 + }, + { + "epoch": 15.372009029345373, + "grad_norm": 236.87033081054688, + "learning_rate": 6.8711433756805804e-06, + "loss": 38.9811, + "step": 4258 + }, + { + "epoch": 15.375620767494357, + "grad_norm": 293.6808776855469, + "learning_rate": 6.8656987295825775e-06, + "loss": 39.481, + "step": 4259 + }, + { + "epoch": 15.37923250564334, + "grad_norm": 266.0845947265625, + "learning_rate": 6.860254083484574e-06, + "loss": 39.4595, + "step": 4260 + }, + { + "epoch": 15.37923250564334, + "eval_loss": 0.6039742231369019, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 4260 + }, + { + "epoch": 15.382844243792325, + "grad_norm": 398.0877685546875, + "learning_rate": 6.85480943738657e-06, + "loss": 38.8899, + "step": 4261 + }, + { + "epoch": 15.386455981941308, + "grad_norm": 208.37376403808594, + "learning_rate": 6.849364791288566e-06, + "loss": 39.2194, + "step": 4262 + }, + { + "epoch": 15.390067720090293, + "grad_norm": 214.6958770751953, + "learning_rate": 6.8439201451905624e-06, + "loss": 38.9911, + "step": 4263 + }, + { + "epoch": 15.393679458239278, + "grad_norm": 210.2147674560547, + "learning_rate": 6.8384754990925595e-06, + "loss": 40.5973, + "step": 4264 + }, + { + "epoch": 15.397291196388261, + "grad_norm": 240.47030639648438, + "learning_rate": 6.833030852994556e-06, + "loss": 39.3936, + "step": 4265 + }, + { + "epoch": 15.400902934537246, + "grad_norm": 273.86883544921875, + "learning_rate": 6.827586206896552e-06, + "loss": 40.0848, + "step": 4266 + }, + { + "epoch": 15.404514672686231, + "grad_norm": 239.36453247070312, + "learning_rate": 6.822141560798548e-06, + "loss": 36.5967, + "step": 4267 + }, + { + "epoch": 15.408126410835214, + "grad_norm": 215.3413543701172, + "learning_rate": 6.8166969147005444e-06, + "loss": 37.8173, + "step": 4268 + }, + { + "epoch": 15.411738148984199, + "grad_norm": 260.1557312011719, + "learning_rate": 6.811252268602541e-06, + "loss": 37.7175, + "step": 4269 + }, + { + "epoch": 15.415349887133182, + "grad_norm": 239.4988555908203, + "learning_rate": 6.805807622504537e-06, + "loss": 37.0618, + "step": 4270 + }, + { + "epoch": 15.415349887133182, + "eval_loss": 0.6049810647964478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4270 + }, + { + "epoch": 15.418961625282167, + "grad_norm": 223.06094360351562, + "learning_rate": 6.800362976406534e-06, + "loss": 37.0687, + "step": 4271 + }, + { + "epoch": 15.422573363431152, + "grad_norm": 261.7460632324219, + "learning_rate": 6.79491833030853e-06, + "loss": 35.9437, + "step": 4272 + }, + { + "epoch": 15.426185101580135, + "grad_norm": 230.92135620117188, + "learning_rate": 6.7894736842105264e-06, + "loss": 38.3316, + "step": 4273 + }, + { + "epoch": 15.42979683972912, + "grad_norm": 370.6309509277344, + "learning_rate": 6.784029038112523e-06, + "loss": 38.2666, + "step": 4274 + }, + { + "epoch": 15.433408577878104, + "grad_norm": 249.7823944091797, + "learning_rate": 6.77858439201452e-06, + "loss": 38.1159, + "step": 4275 + }, + { + "epoch": 15.437020316027088, + "grad_norm": 404.1676330566406, + "learning_rate": 6.773139745916516e-06, + "loss": 37.6548, + "step": 4276 + }, + { + "epoch": 15.440632054176072, + "grad_norm": 256.3241271972656, + "learning_rate": 6.767695099818511e-06, + "loss": 38.3713, + "step": 4277 + }, + { + "epoch": 15.444243792325057, + "grad_norm": 240.55934143066406, + "learning_rate": 6.7622504537205084e-06, + "loss": 39.2487, + "step": 4278 + }, + { + "epoch": 15.44785553047404, + "grad_norm": 230.010009765625, + "learning_rate": 6.756805807622505e-06, + "loss": 39.4391, + "step": 4279 + }, + { + "epoch": 15.451467268623025, + "grad_norm": 226.51385498046875, + "learning_rate": 6.751361161524502e-06, + "loss": 38.6273, + "step": 4280 + }, + { + "epoch": 15.451467268623025, + "eval_loss": 0.6027400493621826, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 4280 + }, + { + "epoch": 15.455079006772008, + "grad_norm": 314.57476806640625, + "learning_rate": 6.745916515426497e-06, + "loss": 38.583, + "step": 4281 + }, + { + "epoch": 15.458690744920993, + "grad_norm": 229.91238403320312, + "learning_rate": 6.740471869328494e-06, + "loss": 39.2433, + "step": 4282 + }, + { + "epoch": 15.462302483069978, + "grad_norm": 284.7301330566406, + "learning_rate": 6.7350272232304904e-06, + "loss": 38.8577, + "step": 4283 + }, + { + "epoch": 15.465914221218961, + "grad_norm": 209.32266235351562, + "learning_rate": 6.729582577132486e-06, + "loss": 34.928, + "step": 4284 + }, + { + "epoch": 15.469525959367946, + "grad_norm": 264.6195068359375, + "learning_rate": 6.724137931034483e-06, + "loss": 32.0527, + "step": 4285 + }, + { + "epoch": 15.47313769751693, + "grad_norm": 224.2421112060547, + "learning_rate": 6.718693284936479e-06, + "loss": 31.939, + "step": 4286 + }, + { + "epoch": 15.476749435665914, + "grad_norm": 233.0791015625, + "learning_rate": 6.713248638838476e-06, + "loss": 32.5402, + "step": 4287 + }, + { + "epoch": 15.480361173814899, + "grad_norm": 284.129638671875, + "learning_rate": 6.707803992740472e-06, + "loss": 31.0069, + "step": 4288 + }, + { + "epoch": 15.483972911963882, + "grad_norm": 253.6517791748047, + "learning_rate": 6.702359346642469e-06, + "loss": 32.0172, + "step": 4289 + }, + { + "epoch": 15.487584650112867, + "grad_norm": 305.63775634765625, + "learning_rate": 6.696914700544465e-06, + "loss": 34.1643, + "step": 4290 + }, + { + "epoch": 15.487584650112867, + "eval_loss": 0.6044390201568604, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 4290 + }, + { + "epoch": 15.491196388261852, + "grad_norm": 224.6516876220703, + "learning_rate": 6.691470054446461e-06, + "loss": 32.4735, + "step": 4291 + }, + { + "epoch": 15.494808126410835, + "grad_norm": 257.5385437011719, + "learning_rate": 6.686025408348457e-06, + "loss": 33.9272, + "step": 4292 + }, + { + "epoch": 15.49841986455982, + "grad_norm": 393.9106140136719, + "learning_rate": 6.680580762250454e-06, + "loss": 34.4176, + "step": 4293 + }, + { + "epoch": 15.502031602708804, + "grad_norm": 333.5639953613281, + "learning_rate": 6.675136116152451e-06, + "loss": 34.5695, + "step": 4294 + }, + { + "epoch": 15.505643340857787, + "grad_norm": 319.8660888671875, + "learning_rate": 6.669691470054446e-06, + "loss": 34.5337, + "step": 4295 + }, + { + "epoch": 15.509255079006772, + "grad_norm": 246.78086853027344, + "learning_rate": 6.664246823956443e-06, + "loss": 34.8297, + "step": 4296 + }, + { + "epoch": 15.512866817155757, + "grad_norm": 313.4530944824219, + "learning_rate": 6.658802177858439e-06, + "loss": 34.6901, + "step": 4297 + }, + { + "epoch": 15.51647855530474, + "grad_norm": 257.2852783203125, + "learning_rate": 6.6533575317604364e-06, + "loss": 35.3892, + "step": 4298 + }, + { + "epoch": 15.520090293453725, + "grad_norm": 336.5549011230469, + "learning_rate": 6.647912885662432e-06, + "loss": 36.3347, + "step": 4299 + }, + { + "epoch": 15.523702031602708, + "grad_norm": 275.726806640625, + "learning_rate": 6.642468239564428e-06, + "loss": 36.3559, + "step": 4300 + }, + { + "epoch": 15.523702031602708, + "eval_loss": 0.6056334376335144, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 4300 + }, + { + "epoch": 15.527313769751693, + "grad_norm": 275.5987243652344, + "learning_rate": 6.637023593466425e-06, + "loss": 28.5887, + "step": 4301 + }, + { + "epoch": 15.530925507900678, + "grad_norm": 242.59762573242188, + "learning_rate": 6.631578947368421e-06, + "loss": 22.1398, + "step": 4302 + }, + { + "epoch": 15.534537246049661, + "grad_norm": 228.04344177246094, + "learning_rate": 6.626134301270418e-06, + "loss": 21.4593, + "step": 4303 + }, + { + "epoch": 15.538148984198646, + "grad_norm": 204.2377166748047, + "learning_rate": 6.620689655172414e-06, + "loss": 22.5132, + "step": 4304 + }, + { + "epoch": 15.54176072234763, + "grad_norm": 243.0237579345703, + "learning_rate": 6.615245009074411e-06, + "loss": 24.2777, + "step": 4305 + }, + { + "epoch": 15.545372460496614, + "grad_norm": 227.2841339111328, + "learning_rate": 6.609800362976407e-06, + "loss": 39.7235, + "step": 4306 + }, + { + "epoch": 15.548984198645599, + "grad_norm": 253.8453826904297, + "learning_rate": 6.6043557168784025e-06, + "loss": 39.9317, + "step": 4307 + }, + { + "epoch": 15.552595936794582, + "grad_norm": 243.62757873535156, + "learning_rate": 6.5989110707804e-06, + "loss": 38.9825, + "step": 4308 + }, + { + "epoch": 15.556207674943566, + "grad_norm": 262.4398498535156, + "learning_rate": 6.593466424682396e-06, + "loss": 39.7456, + "step": 4309 + }, + { + "epoch": 15.559819413092551, + "grad_norm": 268.5821228027344, + "learning_rate": 6.588021778584392e-06, + "loss": 39.5152, + "step": 4310 + }, + { + "epoch": 15.559819413092551, + "eval_loss": 0.6060237288475037, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4310 + }, + { + "epoch": 15.563431151241534, + "grad_norm": 297.6933898925781, + "learning_rate": 6.582577132486388e-06, + "loss": 40.1259, + "step": 4311 + }, + { + "epoch": 15.56704288939052, + "grad_norm": 234.08816528320312, + "learning_rate": 6.577132486388385e-06, + "loss": 40.8591, + "step": 4312 + }, + { + "epoch": 15.570654627539504, + "grad_norm": 292.2416687011719, + "learning_rate": 6.571687840290382e-06, + "loss": 39.2377, + "step": 4313 + }, + { + "epoch": 15.574266365688487, + "grad_norm": 205.25888061523438, + "learning_rate": 6.566243194192377e-06, + "loss": 39.92, + "step": 4314 + }, + { + "epoch": 15.577878103837472, + "grad_norm": 229.06695556640625, + "learning_rate": 6.560798548094374e-06, + "loss": 39.8886, + "step": 4315 + }, + { + "epoch": 15.581489841986457, + "grad_norm": 223.3977508544922, + "learning_rate": 6.55535390199637e-06, + "loss": 38.5423, + "step": 4316 + }, + { + "epoch": 15.58510158013544, + "grad_norm": 254.60203552246094, + "learning_rate": 6.549909255898367e-06, + "loss": 36.8055, + "step": 4317 + }, + { + "epoch": 15.588713318284425, + "grad_norm": 304.463623046875, + "learning_rate": 6.544464609800363e-06, + "loss": 37.6164, + "step": 4318 + }, + { + "epoch": 15.592325056433408, + "grad_norm": 279.955810546875, + "learning_rate": 6.53901996370236e-06, + "loss": 37.4778, + "step": 4319 + }, + { + "epoch": 15.595936794582393, + "grad_norm": 230.11105346679688, + "learning_rate": 6.533575317604356e-06, + "loss": 36.9663, + "step": 4320 + }, + { + "epoch": 15.595936794582393, + "eval_loss": 0.6048213243484497, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.966, + "eval_steps_per_second": 56.966, + "step": 4320 + }, + { + "epoch": 15.599548532731378, + "grad_norm": 261.98187255859375, + "learning_rate": 6.528130671506351e-06, + "loss": 37.7402, + "step": 4321 + }, + { + "epoch": 15.60316027088036, + "grad_norm": 247.34771728515625, + "learning_rate": 6.5226860254083485e-06, + "loss": 37.1402, + "step": 4322 + }, + { + "epoch": 15.606772009029346, + "grad_norm": 277.1517333984375, + "learning_rate": 6.517241379310345e-06, + "loss": 38.3976, + "step": 4323 + }, + { + "epoch": 15.610383747178329, + "grad_norm": 231.89683532714844, + "learning_rate": 6.511796733212342e-06, + "loss": 38.0834, + "step": 4324 + }, + { + "epoch": 15.613995485327314, + "grad_norm": 323.8349304199219, + "learning_rate": 6.506352087114337e-06, + "loss": 37.9085, + "step": 4325 + }, + { + "epoch": 15.617607223476298, + "grad_norm": 263.5240783691406, + "learning_rate": 6.500907441016334e-06, + "loss": 37.0702, + "step": 4326 + }, + { + "epoch": 15.621218961625281, + "grad_norm": 217.0517578125, + "learning_rate": 6.4954627949183305e-06, + "loss": 36.9406, + "step": 4327 + }, + { + "epoch": 15.624830699774266, + "grad_norm": 267.4161682128906, + "learning_rate": 6.4900181488203276e-06, + "loss": 38.8773, + "step": 4328 + }, + { + "epoch": 15.628442437923251, + "grad_norm": 232.36000061035156, + "learning_rate": 6.484573502722323e-06, + "loss": 38.4978, + "step": 4329 + }, + { + "epoch": 15.632054176072234, + "grad_norm": 241.61373901367188, + "learning_rate": 6.479128856624319e-06, + "loss": 38.4895, + "step": 4330 + }, + { + "epoch": 15.632054176072234, + "eval_loss": 0.6024956703186035, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4330 + }, + { + "epoch": 15.635665914221219, + "grad_norm": 232.27928161621094, + "learning_rate": 6.473684210526316e-06, + "loss": 38.8551, + "step": 4331 + }, + { + "epoch": 15.639277652370204, + "grad_norm": 243.42828369140625, + "learning_rate": 6.4682395644283125e-06, + "loss": 38.6475, + "step": 4332 + }, + { + "epoch": 15.642889390519187, + "grad_norm": 306.2618103027344, + "learning_rate": 6.462794918330309e-06, + "loss": 37.2015, + "step": 4333 + }, + { + "epoch": 15.646501128668172, + "grad_norm": 335.795166015625, + "learning_rate": 6.457350272232305e-06, + "loss": 36.5255, + "step": 4334 + }, + { + "epoch": 15.650112866817155, + "grad_norm": 209.6246337890625, + "learning_rate": 6.451905626134302e-06, + "loss": 32.4219, + "step": 4335 + }, + { + "epoch": 15.65372460496614, + "grad_norm": 283.2094421386719, + "learning_rate": 6.446460980036297e-06, + "loss": 30.9137, + "step": 4336 + }, + { + "epoch": 15.657336343115125, + "grad_norm": 255.4412841796875, + "learning_rate": 6.441016333938294e-06, + "loss": 30.8939, + "step": 4337 + }, + { + "epoch": 15.660948081264108, + "grad_norm": 217.8052215576172, + "learning_rate": 6.435571687840291e-06, + "loss": 31.5974, + "step": 4338 + }, + { + "epoch": 15.664559819413093, + "grad_norm": 215.64398193359375, + "learning_rate": 6.430127041742287e-06, + "loss": 30.0276, + "step": 4339 + }, + { + "epoch": 15.668171557562077, + "grad_norm": 244.32704162597656, + "learning_rate": 6.424682395644283e-06, + "loss": 32.5249, + "step": 4340 + }, + { + "epoch": 15.668171557562077, + "eval_loss": 0.6037233471870422, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4340 + }, + { + "epoch": 15.67178329571106, + "grad_norm": 270.9132080078125, + "learning_rate": 6.419237749546279e-06, + "loss": 32.9923, + "step": 4341 + }, + { + "epoch": 15.675395033860045, + "grad_norm": 230.20314025878906, + "learning_rate": 6.4137931034482765e-06, + "loss": 32.871, + "step": 4342 + }, + { + "epoch": 15.679006772009028, + "grad_norm": 372.4366149902344, + "learning_rate": 6.408348457350273e-06, + "loss": 35.2687, + "step": 4343 + }, + { + "epoch": 15.682618510158013, + "grad_norm": 325.0901794433594, + "learning_rate": 6.402903811252268e-06, + "loss": 34.3107, + "step": 4344 + }, + { + "epoch": 15.686230248306998, + "grad_norm": 277.8683166503906, + "learning_rate": 6.397459165154265e-06, + "loss": 34.291, + "step": 4345 + }, + { + "epoch": 15.689841986455981, + "grad_norm": 262.566162109375, + "learning_rate": 6.392014519056261e-06, + "loss": 33.2989, + "step": 4346 + }, + { + "epoch": 15.693453724604966, + "grad_norm": 293.56536865234375, + "learning_rate": 6.386569872958258e-06, + "loss": 35.6865, + "step": 4347 + }, + { + "epoch": 15.697065462753951, + "grad_norm": 291.1886291503906, + "learning_rate": 6.381125226860254e-06, + "loss": 35.6959, + "step": 4348 + }, + { + "epoch": 15.700677200902934, + "grad_norm": 265.2365417480469, + "learning_rate": 6.375680580762251e-06, + "loss": 36.479, + "step": 4349 + }, + { + "epoch": 15.704288939051919, + "grad_norm": 342.8822021484375, + "learning_rate": 6.370235934664247e-06, + "loss": 35.9198, + "step": 4350 + }, + { + "epoch": 15.704288939051919, + "eval_loss": 0.603361189365387, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4350 + }, + { + "epoch": 15.707900677200904, + "grad_norm": 276.1657409667969, + "learning_rate": 6.364791288566243e-06, + "loss": 29.429, + "step": 4351 + }, + { + "epoch": 15.711512415349887, + "grad_norm": 267.2456359863281, + "learning_rate": 6.35934664246824e-06, + "loss": 23.0038, + "step": 4352 + }, + { + "epoch": 15.715124153498872, + "grad_norm": 255.4893798828125, + "learning_rate": 6.353901996370236e-06, + "loss": 21.1185, + "step": 4353 + }, + { + "epoch": 15.718735891647855, + "grad_norm": 252.10501098632812, + "learning_rate": 6.348457350272233e-06, + "loss": 23.1769, + "step": 4354 + }, + { + "epoch": 15.72234762979684, + "grad_norm": 239.63905334472656, + "learning_rate": 6.343012704174228e-06, + "loss": 24.5905, + "step": 4355 + }, + { + "epoch": 15.725959367945824, + "grad_norm": 228.00950622558594, + "learning_rate": 6.337568058076225e-06, + "loss": 39.6657, + "step": 4356 + }, + { + "epoch": 15.729571106094808, + "grad_norm": 234.10647583007812, + "learning_rate": 6.332123411978222e-06, + "loss": 41.145, + "step": 4357 + }, + { + "epoch": 15.733182844243792, + "grad_norm": 236.55223083496094, + "learning_rate": 6.326678765880219e-06, + "loss": 40.2784, + "step": 4358 + }, + { + "epoch": 15.736794582392777, + "grad_norm": 340.1712646484375, + "learning_rate": 6.321234119782214e-06, + "loss": 39.3598, + "step": 4359 + }, + { + "epoch": 15.74040632054176, + "grad_norm": 269.4134826660156, + "learning_rate": 6.31578947368421e-06, + "loss": 38.7777, + "step": 4360 + }, + { + "epoch": 15.74040632054176, + "eval_loss": 0.6048015356063843, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4360 + }, + { + "epoch": 15.744018058690745, + "grad_norm": 316.5471496582031, + "learning_rate": 6.310344827586207e-06, + "loss": 39.6707, + "step": 4361 + }, + { + "epoch": 15.747629796839728, + "grad_norm": 231.31820678710938, + "learning_rate": 6.304900181488203e-06, + "loss": 38.0009, + "step": 4362 + }, + { + "epoch": 15.751241534988713, + "grad_norm": 207.19117736816406, + "learning_rate": 6.2994555353902e-06, + "loss": 41.6523, + "step": 4363 + }, + { + "epoch": 15.754853273137698, + "grad_norm": 239.8341064453125, + "learning_rate": 6.294010889292196e-06, + "loss": 40.3203, + "step": 4364 + }, + { + "epoch": 15.758465011286681, + "grad_norm": 277.2004089355469, + "learning_rate": 6.288566243194193e-06, + "loss": 39.8026, + "step": 4365 + }, + { + "epoch": 15.762076749435666, + "grad_norm": 227.74728393554688, + "learning_rate": 6.2831215970961886e-06, + "loss": 38.1561, + "step": 4366 + }, + { + "epoch": 15.76568848758465, + "grad_norm": 268.6826477050781, + "learning_rate": 6.277676950998185e-06, + "loss": 37.4653, + "step": 4367 + }, + { + "epoch": 15.769300225733634, + "grad_norm": 308.92950439453125, + "learning_rate": 6.272232304900182e-06, + "loss": 36.3506, + "step": 4368 + }, + { + "epoch": 15.772911963882619, + "grad_norm": 216.53627014160156, + "learning_rate": 6.266787658802178e-06, + "loss": 36.12, + "step": 4369 + }, + { + "epoch": 15.776523702031604, + "grad_norm": 264.0691833496094, + "learning_rate": 6.261343012704174e-06, + "loss": 37.5023, + "step": 4370 + }, + { + "epoch": 15.776523702031604, + "eval_loss": 0.608928382396698, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.989, + "eval_steps_per_second": 56.989, + "step": 4370 + }, + { + "epoch": 15.780135440180587, + "grad_norm": 474.7265319824219, + "learning_rate": 6.2558983666061706e-06, + "loss": 38.8381, + "step": 4371 + }, + { + "epoch": 15.783747178329572, + "grad_norm": 303.66229248046875, + "learning_rate": 6.250453720508168e-06, + "loss": 36.5951, + "step": 4372 + }, + { + "epoch": 15.787358916478555, + "grad_norm": 231.65744018554688, + "learning_rate": 6.245009074410164e-06, + "loss": 36.4717, + "step": 4373 + }, + { + "epoch": 15.79097065462754, + "grad_norm": 235.25833129882812, + "learning_rate": 6.239564428312159e-06, + "loss": 38.4578, + "step": 4374 + }, + { + "epoch": 15.794582392776524, + "grad_norm": 215.5384063720703, + "learning_rate": 6.234119782214156e-06, + "loss": 38.0475, + "step": 4375 + }, + { + "epoch": 15.798194130925507, + "grad_norm": 216.3609619140625, + "learning_rate": 6.2286751361161526e-06, + "loss": 37.1825, + "step": 4376 + }, + { + "epoch": 15.801805869074492, + "grad_norm": 275.54522705078125, + "learning_rate": 6.223230490018149e-06, + "loss": 38.5608, + "step": 4377 + }, + { + "epoch": 15.805417607223477, + "grad_norm": 226.7752685546875, + "learning_rate": 6.217785843920145e-06, + "loss": 38.0612, + "step": 4378 + }, + { + "epoch": 15.80902934537246, + "grad_norm": 262.14501953125, + "learning_rate": 6.212341197822142e-06, + "loss": 38.0049, + "step": 4379 + }, + { + "epoch": 15.812641083521445, + "grad_norm": 299.82196044921875, + "learning_rate": 6.206896551724138e-06, + "loss": 39.1441, + "step": 4380 + }, + { + "epoch": 15.812641083521445, + "eval_loss": 0.6033969521522522, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4380 + }, + { + "epoch": 15.816252821670428, + "grad_norm": 295.24188232421875, + "learning_rate": 6.2014519056261346e-06, + "loss": 39.266, + "step": 4381 + }, + { + "epoch": 15.819864559819413, + "grad_norm": 298.1729736328125, + "learning_rate": 6.196007259528131e-06, + "loss": 39.4025, + "step": 4382 + }, + { + "epoch": 15.823476297968398, + "grad_norm": 234.97958374023438, + "learning_rate": 6.190562613430127e-06, + "loss": 39.4752, + "step": 4383 + }, + { + "epoch": 15.827088036117381, + "grad_norm": 270.3009338378906, + "learning_rate": 6.185117967332124e-06, + "loss": 36.0322, + "step": 4384 + }, + { + "epoch": 15.830699774266366, + "grad_norm": 279.78314208984375, + "learning_rate": 6.1796733212341195e-06, + "loss": 33.3256, + "step": 4385 + }, + { + "epoch": 15.83431151241535, + "grad_norm": 258.82598876953125, + "learning_rate": 6.1742286751361166e-06, + "loss": 33.1552, + "step": 4386 + }, + { + "epoch": 15.837923250564334, + "grad_norm": 280.8109130859375, + "learning_rate": 6.168784029038113e-06, + "loss": 32.0024, + "step": 4387 + }, + { + "epoch": 15.841534988713319, + "grad_norm": 265.08111572265625, + "learning_rate": 6.163339382940109e-06, + "loss": 32.4901, + "step": 4388 + }, + { + "epoch": 15.845146726862303, + "grad_norm": 316.56427001953125, + "learning_rate": 6.157894736842105e-06, + "loss": 33.1995, + "step": 4389 + }, + { + "epoch": 15.848758465011286, + "grad_norm": 256.03717041015625, + "learning_rate": 6.1524500907441015e-06, + "loss": 33.1914, + "step": 4390 + }, + { + "epoch": 15.848758465011286, + "eval_loss": 0.6017575263977051, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 4390 + }, + { + "epoch": 15.852370203160271, + "grad_norm": 242.54119873046875, + "learning_rate": 6.1470054446460985e-06, + "loss": 33.8459, + "step": 4391 + }, + { + "epoch": 15.855981941309254, + "grad_norm": 259.1406555175781, + "learning_rate": 6.141560798548094e-06, + "loss": 34.1317, + "step": 4392 + }, + { + "epoch": 15.85959367945824, + "grad_norm": 272.77880859375, + "learning_rate": 6.136116152450091e-06, + "loss": 34.2777, + "step": 4393 + }, + { + "epoch": 15.863205417607224, + "grad_norm": 231.60845947265625, + "learning_rate": 6.130671506352087e-06, + "loss": 34.0165, + "step": 4394 + }, + { + "epoch": 15.866817155756207, + "grad_norm": 230.85675048828125, + "learning_rate": 6.125226860254084e-06, + "loss": 34.2761, + "step": 4395 + }, + { + "epoch": 15.870428893905192, + "grad_norm": 307.4486389160156, + "learning_rate": 6.11978221415608e-06, + "loss": 33.7407, + "step": 4396 + }, + { + "epoch": 15.874040632054175, + "grad_norm": 264.7835388183594, + "learning_rate": 6.114337568058076e-06, + "loss": 34.1672, + "step": 4397 + }, + { + "epoch": 15.87765237020316, + "grad_norm": 234.93968200683594, + "learning_rate": 6.108892921960073e-06, + "loss": 35.7158, + "step": 4398 + }, + { + "epoch": 15.881264108352145, + "grad_norm": 300.0079345703125, + "learning_rate": 6.103448275862069e-06, + "loss": 36.1292, + "step": 4399 + }, + { + "epoch": 15.884875846501128, + "grad_norm": 326.20416259765625, + "learning_rate": 6.0980036297640655e-06, + "loss": 34.8222, + "step": 4400 + }, + { + "epoch": 15.884875846501128, + "eval_loss": 0.6024067401885986, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4400 + }, + { + "epoch": 15.888487584650113, + "grad_norm": 214.6174774169922, + "learning_rate": 6.092558983666062e-06, + "loss": 27.4819, + "step": 4401 + }, + { + "epoch": 15.892099322799098, + "grad_norm": 222.7063446044922, + "learning_rate": 6.087114337568059e-06, + "loss": 22.3862, + "step": 4402 + }, + { + "epoch": 15.89571106094808, + "grad_norm": 277.0006103515625, + "learning_rate": 6.081669691470054e-06, + "loss": 22.8483, + "step": 4403 + }, + { + "epoch": 15.899322799097066, + "grad_norm": 264.3949890136719, + "learning_rate": 6.076225045372051e-06, + "loss": 23.2021, + "step": 4404 + }, + { + "epoch": 15.90293453724605, + "grad_norm": 244.04611206054688, + "learning_rate": 6.0707803992740475e-06, + "loss": 23.9378, + "step": 4405 + }, + { + "epoch": 15.906546275395034, + "grad_norm": 219.24403381347656, + "learning_rate": 6.065335753176044e-06, + "loss": 39.4708, + "step": 4406 + }, + { + "epoch": 15.910158013544018, + "grad_norm": 297.3822937011719, + "learning_rate": 6.05989110707804e-06, + "loss": 39.9151, + "step": 4407 + }, + { + "epoch": 15.913769751693001, + "grad_norm": 282.748291015625, + "learning_rate": 6.054446460980036e-06, + "loss": 39.0545, + "step": 4408 + }, + { + "epoch": 15.917381489841986, + "grad_norm": 274.6419982910156, + "learning_rate": 6.049001814882033e-06, + "loss": 39.7046, + "step": 4409 + }, + { + "epoch": 15.920993227990971, + "grad_norm": 261.2831115722656, + "learning_rate": 6.0435571687840295e-06, + "loss": 39.8849, + "step": 4410 + }, + { + "epoch": 15.920993227990971, + "eval_loss": 0.6017056107521057, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 4410 + }, + { + "epoch": 15.924604966139954, + "grad_norm": 276.61505126953125, + "learning_rate": 6.038112522686026e-06, + "loss": 39.8861, + "step": 4411 + }, + { + "epoch": 15.928216704288939, + "grad_norm": 273.4017333984375, + "learning_rate": 6.032667876588022e-06, + "loss": 36.2526, + "step": 4412 + }, + { + "epoch": 15.931828442437924, + "grad_norm": 314.4811706542969, + "learning_rate": 6.027223230490018e-06, + "loss": 37.1316, + "step": 4413 + }, + { + "epoch": 15.935440180586907, + "grad_norm": 265.7447204589844, + "learning_rate": 6.021778584392014e-06, + "loss": 38.1698, + "step": 4414 + }, + { + "epoch": 15.939051918735892, + "grad_norm": 448.373291015625, + "learning_rate": 6.016333938294011e-06, + "loss": 38.9541, + "step": 4415 + }, + { + "epoch": 15.942663656884875, + "grad_norm": 261.33966064453125, + "learning_rate": 6.010889292196008e-06, + "loss": 36.6694, + "step": 4416 + }, + { + "epoch": 15.94627539503386, + "grad_norm": 383.16363525390625, + "learning_rate": 6.005444646098004e-06, + "loss": 39.1773, + "step": 4417 + }, + { + "epoch": 15.949887133182845, + "grad_norm": 279.26446533203125, + "learning_rate": 6e-06, + "loss": 36.9482, + "step": 4418 + }, + { + "epoch": 15.953498871331828, + "grad_norm": 307.5321960449219, + "learning_rate": 5.994555353901996e-06, + "loss": 36.653, + "step": 4419 + }, + { + "epoch": 15.957110609480813, + "grad_norm": 412.80023193359375, + "learning_rate": 5.989110707803993e-06, + "loss": 36.3768, + "step": 4420 + }, + { + "epoch": 15.957110609480813, + "eval_loss": 0.6033455729484558, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4420 + }, + { + "epoch": 15.960722347629797, + "grad_norm": 254.2952880859375, + "learning_rate": 5.98366606170599e-06, + "loss": 32.546, + "step": 4421 + }, + { + "epoch": 15.96433408577878, + "grad_norm": 324.0749816894531, + "learning_rate": 5.978221415607985e-06, + "loss": 32.7021, + "step": 4422 + }, + { + "epoch": 15.967945823927765, + "grad_norm": 326.0075988769531, + "learning_rate": 5.972776769509982e-06, + "loss": 33.3823, + "step": 4423 + }, + { + "epoch": 15.97155756207675, + "grad_norm": 252.98471069335938, + "learning_rate": 5.967332123411978e-06, + "loss": 33.3397, + "step": 4424 + }, + { + "epoch": 15.975169300225733, + "grad_norm": 243.14117431640625, + "learning_rate": 5.9618874773139755e-06, + "loss": 34.2781, + "step": 4425 + }, + { + "epoch": 15.978781038374718, + "grad_norm": 304.3429260253906, + "learning_rate": 5.956442831215971e-06, + "loss": 34.1163, + "step": 4426 + }, + { + "epoch": 15.982392776523701, + "grad_norm": 320.1651916503906, + "learning_rate": 5.950998185117968e-06, + "loss": 34.1024, + "step": 4427 + }, + { + "epoch": 15.986004514672686, + "grad_norm": 252.0004425048828, + "learning_rate": 5.945553539019964e-06, + "loss": 35.8121, + "step": 4428 + }, + { + "epoch": 15.989616252821671, + "grad_norm": 342.5635986328125, + "learning_rate": 5.9401088929219595e-06, + "loss": 35.6666, + "step": 4429 + }, + { + "epoch": 15.993227990970654, + "grad_norm": 226.57249450683594, + "learning_rate": 5.934664246823957e-06, + "loss": 30.2617, + "step": 4430 + }, + { + "epoch": 15.993227990970654, + "eval_loss": 0.6029886603355408, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4430 + }, + { + "epoch": 15.996839729119639, + "grad_norm": 202.94903564453125, + "learning_rate": 5.929219600725953e-06, + "loss": 22.8166, + "step": 4431 + }, + { + "epoch": 16.0, + "grad_norm": 200.84317016601562, + "learning_rate": 5.92377495462795e-06, + "loss": 20.3903, + "step": 4432 + }, + { + "epoch": 16.003611738148983, + "grad_norm": 230.5917510986328, + "learning_rate": 5.918330308529945e-06, + "loss": 39.0985, + "step": 4433 + }, + { + "epoch": 16.00722347629797, + "grad_norm": 285.6978759765625, + "learning_rate": 5.912885662431942e-06, + "loss": 39.2128, + "step": 4434 + }, + { + "epoch": 16.010835214446953, + "grad_norm": 221.70896911621094, + "learning_rate": 5.907441016333939e-06, + "loss": 38.9026, + "step": 4435 + }, + { + "epoch": 16.014446952595936, + "grad_norm": 318.14068603515625, + "learning_rate": 5.901996370235935e-06, + "loss": 38.7336, + "step": 4436 + }, + { + "epoch": 16.018058690744923, + "grad_norm": 324.451904296875, + "learning_rate": 5.896551724137931e-06, + "loss": 38.7117, + "step": 4437 + }, + { + "epoch": 16.021670428893906, + "grad_norm": 295.038818359375, + "learning_rate": 5.891107078039927e-06, + "loss": 39.6053, + "step": 4438 + }, + { + "epoch": 16.02528216704289, + "grad_norm": 267.0055236816406, + "learning_rate": 5.885662431941924e-06, + "loss": 38.931, + "step": 4439 + }, + { + "epoch": 16.028893905191875, + "grad_norm": 269.20074462890625, + "learning_rate": 5.88021778584392e-06, + "loss": 41.1717, + "step": 4440 + }, + { + "epoch": 16.028893905191875, + "eval_loss": 0.6036069393157959, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.899, + "eval_steps_per_second": 56.899, + "step": 4440 + }, + { + "epoch": 16.03250564334086, + "grad_norm": 241.9443359375, + "learning_rate": 5.874773139745917e-06, + "loss": 38.7027, + "step": 4441 + }, + { + "epoch": 16.03611738148984, + "grad_norm": 238.54847717285156, + "learning_rate": 5.869328493647913e-06, + "loss": 39.1284, + "step": 4442 + }, + { + "epoch": 16.039729119638825, + "grad_norm": 339.3023681640625, + "learning_rate": 5.863883847549909e-06, + "loss": 38.0767, + "step": 4443 + }, + { + "epoch": 16.04334085778781, + "grad_norm": 257.29522705078125, + "learning_rate": 5.8584392014519055e-06, + "loss": 34.8207, + "step": 4444 + }, + { + "epoch": 16.046952595936794, + "grad_norm": 264.24200439453125, + "learning_rate": 5.852994555353902e-06, + "loss": 35.5021, + "step": 4445 + }, + { + "epoch": 16.050564334085777, + "grad_norm": 251.3128662109375, + "learning_rate": 5.847549909255899e-06, + "loss": 35.7826, + "step": 4446 + }, + { + "epoch": 16.054176072234764, + "grad_norm": 310.6581726074219, + "learning_rate": 5.842105263157895e-06, + "loss": 36.7373, + "step": 4447 + }, + { + "epoch": 16.057787810383747, + "grad_norm": 299.07550048828125, + "learning_rate": 5.836660617059891e-06, + "loss": 36.4048, + "step": 4448 + }, + { + "epoch": 16.06139954853273, + "grad_norm": 257.58740234375, + "learning_rate": 5.8312159709618875e-06, + "loss": 36.3982, + "step": 4449 + }, + { + "epoch": 16.065011286681717, + "grad_norm": 337.6795654296875, + "learning_rate": 5.825771324863884e-06, + "loss": 36.8518, + "step": 4450 + }, + { + "epoch": 16.065011286681717, + "eval_loss": 0.6036850214004517, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4450 + }, + { + "epoch": 16.0686230248307, + "grad_norm": 275.02423095703125, + "learning_rate": 5.820326678765881e-06, + "loss": 36.1763, + "step": 4451 + }, + { + "epoch": 16.072234762979683, + "grad_norm": 263.4334716796875, + "learning_rate": 5.814882032667876e-06, + "loss": 37.6417, + "step": 4452 + }, + { + "epoch": 16.07584650112867, + "grad_norm": 213.16749572753906, + "learning_rate": 5.809437386569873e-06, + "loss": 35.6537, + "step": 4453 + }, + { + "epoch": 16.079458239277653, + "grad_norm": 263.4288330078125, + "learning_rate": 5.8039927404718695e-06, + "loss": 36.5693, + "step": 4454 + }, + { + "epoch": 16.083069977426636, + "grad_norm": 284.67254638671875, + "learning_rate": 5.798548094373866e-06, + "loss": 37.3424, + "step": 4455 + }, + { + "epoch": 16.086681715575622, + "grad_norm": 355.7987060546875, + "learning_rate": 5.793103448275862e-06, + "loss": 38.7851, + "step": 4456 + }, + { + "epoch": 16.090293453724605, + "grad_norm": 249.7351531982422, + "learning_rate": 5.787658802177859e-06, + "loss": 38.1334, + "step": 4457 + }, + { + "epoch": 16.09390519187359, + "grad_norm": 257.4977722167969, + "learning_rate": 5.782214156079855e-06, + "loss": 37.8369, + "step": 4458 + }, + { + "epoch": 16.097516930022575, + "grad_norm": 242.59584045410156, + "learning_rate": 5.776769509981851e-06, + "loss": 37.4005, + "step": 4459 + }, + { + "epoch": 16.101128668171558, + "grad_norm": 270.0740966796875, + "learning_rate": 5.771324863883848e-06, + "loss": 38.2287, + "step": 4460 + }, + { + "epoch": 16.101128668171558, + "eval_loss": 0.6018803119659424, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 4460 + }, + { + "epoch": 16.10474040632054, + "grad_norm": 225.32322692871094, + "learning_rate": 5.765880217785844e-06, + "loss": 35.7162, + "step": 4461 + }, + { + "epoch": 16.108352144469524, + "grad_norm": 275.3272705078125, + "learning_rate": 5.760435571687841e-06, + "loss": 32.8733, + "step": 4462 + }, + { + "epoch": 16.11196388261851, + "grad_norm": 259.5124206542969, + "learning_rate": 5.7549909255898364e-06, + "loss": 33.2271, + "step": 4463 + }, + { + "epoch": 16.115575620767494, + "grad_norm": 249.75738525390625, + "learning_rate": 5.7495462794918335e-06, + "loss": 30.2931, + "step": 4464 + }, + { + "epoch": 16.119187358916477, + "grad_norm": 277.7652282714844, + "learning_rate": 5.74410163339383e-06, + "loss": 30.9294, + "step": 4465 + }, + { + "epoch": 16.122799097065464, + "grad_norm": 223.28250122070312, + "learning_rate": 5.738656987295825e-06, + "loss": 31.7337, + "step": 4466 + }, + { + "epoch": 16.126410835214447, + "grad_norm": 259.5106201171875, + "learning_rate": 5.733212341197822e-06, + "loss": 31.2897, + "step": 4467 + }, + { + "epoch": 16.13002257336343, + "grad_norm": 241.0313720703125, + "learning_rate": 5.7277676950998184e-06, + "loss": 32.8436, + "step": 4468 + }, + { + "epoch": 16.133634311512417, + "grad_norm": 277.46905517578125, + "learning_rate": 5.7223230490018155e-06, + "loss": 33.6823, + "step": 4469 + }, + { + "epoch": 16.1372460496614, + "grad_norm": 264.2905578613281, + "learning_rate": 5.716878402903811e-06, + "loss": 33.1107, + "step": 4470 + }, + { + "epoch": 16.1372460496614, + "eval_loss": 0.6046355962753296, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 4470 + }, + { + "epoch": 16.140857787810383, + "grad_norm": 295.5188903808594, + "learning_rate": 5.711433756805808e-06, + "loss": 33.6291, + "step": 4471 + }, + { + "epoch": 16.14446952595937, + "grad_norm": 282.6014709472656, + "learning_rate": 5.705989110707804e-06, + "loss": 33.0773, + "step": 4472 + }, + { + "epoch": 16.148081264108352, + "grad_norm": 270.7958679199219, + "learning_rate": 5.7005444646098004e-06, + "loss": 35.0269, + "step": 4473 + }, + { + "epoch": 16.151693002257336, + "grad_norm": 344.7304992675781, + "learning_rate": 5.695099818511797e-06, + "loss": 35.1349, + "step": 4474 + }, + { + "epoch": 16.155304740406322, + "grad_norm": 294.5618896484375, + "learning_rate": 5.689655172413793e-06, + "loss": 36.3309, + "step": 4475 + }, + { + "epoch": 16.158916478555305, + "grad_norm": 305.5354309082031, + "learning_rate": 5.68421052631579e-06, + "loss": 35.0976, + "step": 4476 + }, + { + "epoch": 16.16252821670429, + "grad_norm": 293.9934387207031, + "learning_rate": 5.678765880217786e-06, + "loss": 34.9113, + "step": 4477 + }, + { + "epoch": 16.16613995485327, + "grad_norm": 277.9523010253906, + "learning_rate": 5.6733212341197824e-06, + "loss": 24.8815, + "step": 4478 + }, + { + "epoch": 16.169751693002258, + "grad_norm": 297.0547790527344, + "learning_rate": 5.667876588021779e-06, + "loss": 22.4544, + "step": 4479 + }, + { + "epoch": 16.17336343115124, + "grad_norm": 237.44741821289062, + "learning_rate": 5.662431941923776e-06, + "loss": 21.8323, + "step": 4480 + }, + { + "epoch": 16.17336343115124, + "eval_loss": 0.6061411499977112, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4480 + }, + { + "epoch": 16.176975169300224, + "grad_norm": 220.5832977294922, + "learning_rate": 5.656987295825771e-06, + "loss": 22.7531, + "step": 4481 + }, + { + "epoch": 16.18058690744921, + "grad_norm": 298.8033142089844, + "learning_rate": 5.651542649727767e-06, + "loss": 23.7107, + "step": 4482 + }, + { + "epoch": 16.184198645598194, + "grad_norm": 250.02593994140625, + "learning_rate": 5.6460980036297644e-06, + "loss": 39.1679, + "step": 4483 + }, + { + "epoch": 16.187810383747177, + "grad_norm": 253.00746154785156, + "learning_rate": 5.640653357531761e-06, + "loss": 40.6492, + "step": 4484 + }, + { + "epoch": 16.191422121896164, + "grad_norm": 215.04270935058594, + "learning_rate": 5.635208711433757e-06, + "loss": 38.604, + "step": 4485 + }, + { + "epoch": 16.195033860045147, + "grad_norm": 395.6152648925781, + "learning_rate": 5.629764065335753e-06, + "loss": 39.1417, + "step": 4486 + }, + { + "epoch": 16.19864559819413, + "grad_norm": 380.3653869628906, + "learning_rate": 5.62431941923775e-06, + "loss": 39.4322, + "step": 4487 + }, + { + "epoch": 16.202257336343116, + "grad_norm": 309.3524475097656, + "learning_rate": 5.6188747731397464e-06, + "loss": 39.1721, + "step": 4488 + }, + { + "epoch": 16.2058690744921, + "grad_norm": 237.88262939453125, + "learning_rate": 5.613430127041742e-06, + "loss": 39.1462, + "step": 4489 + }, + { + "epoch": 16.209480812641083, + "grad_norm": 233.66690063476562, + "learning_rate": 5.607985480943739e-06, + "loss": 39.8177, + "step": 4490 + }, + { + "epoch": 16.209480812641083, + "eval_loss": 0.6043822169303894, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4490 + }, + { + "epoch": 16.21309255079007, + "grad_norm": 229.3720703125, + "learning_rate": 5.602540834845735e-06, + "loss": 39.7878, + "step": 4491 + }, + { + "epoch": 16.216704288939052, + "grad_norm": 228.66493225097656, + "learning_rate": 5.597096188747731e-06, + "loss": 40.0754, + "step": 4492 + }, + { + "epoch": 16.220316027088035, + "grad_norm": 276.40240478515625, + "learning_rate": 5.591651542649728e-06, + "loss": 38.7709, + "step": 4493 + }, + { + "epoch": 16.223927765237022, + "grad_norm": 268.62371826171875, + "learning_rate": 5.586206896551725e-06, + "loss": 37.7439, + "step": 4494 + }, + { + "epoch": 16.227539503386005, + "grad_norm": 271.0934753417969, + "learning_rate": 5.580762250453721e-06, + "loss": 38.2511, + "step": 4495 + }, + { + "epoch": 16.231151241534988, + "grad_norm": 253.63385009765625, + "learning_rate": 5.575317604355716e-06, + "loss": 36.716, + "step": 4496 + }, + { + "epoch": 16.23476297968397, + "grad_norm": 265.1177978515625, + "learning_rate": 5.569872958257713e-06, + "loss": 36.5517, + "step": 4497 + }, + { + "epoch": 16.238374717832958, + "grad_norm": 332.52972412109375, + "learning_rate": 5.56442831215971e-06, + "loss": 37.1524, + "step": 4498 + }, + { + "epoch": 16.24198645598194, + "grad_norm": 247.53643798828125, + "learning_rate": 5.558983666061707e-06, + "loss": 36.6666, + "step": 4499 + }, + { + "epoch": 16.245598194130924, + "grad_norm": 233.3318634033203, + "learning_rate": 5.553539019963702e-06, + "loss": 37.0842, + "step": 4500 + }, + { + "epoch": 16.245598194130924, + "eval_loss": 0.6042913794517517, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4500 + }, + { + "epoch": 16.24920993227991, + "grad_norm": 222.98350524902344, + "learning_rate": 5.548094373865699e-06, + "loss": 37.6382, + "step": 4501 + }, + { + "epoch": 16.252821670428894, + "grad_norm": 234.33267211914062, + "learning_rate": 5.542649727767695e-06, + "loss": 38.0509, + "step": 4502 + }, + { + "epoch": 16.256433408577877, + "grad_norm": 303.56005859375, + "learning_rate": 5.5372050816696924e-06, + "loss": 36.509, + "step": 4503 + }, + { + "epoch": 16.260045146726863, + "grad_norm": 232.0821075439453, + "learning_rate": 5.531760435571688e-06, + "loss": 36.3975, + "step": 4504 + }, + { + "epoch": 16.263656884875846, + "grad_norm": 223.3292236328125, + "learning_rate": 5.526315789473684e-06, + "loss": 37.0448, + "step": 4505 + }, + { + "epoch": 16.26726862302483, + "grad_norm": 241.2131805419922, + "learning_rate": 5.520871143375681e-06, + "loss": 37.8635, + "step": 4506 + }, + { + "epoch": 16.270880361173816, + "grad_norm": 288.62689208984375, + "learning_rate": 5.5154264972776765e-06, + "loss": 38.2789, + "step": 4507 + }, + { + "epoch": 16.2744920993228, + "grad_norm": 262.59637451171875, + "learning_rate": 5.5099818511796736e-06, + "loss": 37.9052, + "step": 4508 + }, + { + "epoch": 16.278103837471782, + "grad_norm": 258.0476379394531, + "learning_rate": 5.50453720508167e-06, + "loss": 38.0485, + "step": 4509 + }, + { + "epoch": 16.28171557562077, + "grad_norm": 295.2730407714844, + "learning_rate": 5.499092558983667e-06, + "loss": 37.6134, + "step": 4510 + }, + { + "epoch": 16.28171557562077, + "eval_loss": 0.601740300655365, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4510 + }, + { + "epoch": 16.285327313769752, + "grad_norm": 246.38548278808594, + "learning_rate": 5.493647912885662e-06, + "loss": 36.1289, + "step": 4511 + }, + { + "epoch": 16.288939051918735, + "grad_norm": 271.28997802734375, + "learning_rate": 5.4882032667876585e-06, + "loss": 31.8834, + "step": 4512 + }, + { + "epoch": 16.292550790067722, + "grad_norm": 231.76246643066406, + "learning_rate": 5.4827586206896556e-06, + "loss": 31.4899, + "step": 4513 + }, + { + "epoch": 16.296162528216705, + "grad_norm": 238.7414093017578, + "learning_rate": 5.477313974591652e-06, + "loss": 31.7102, + "step": 4514 + }, + { + "epoch": 16.299774266365688, + "grad_norm": 302.0710144042969, + "learning_rate": 5.471869328493648e-06, + "loss": 31.3557, + "step": 4515 + }, + { + "epoch": 16.30338600451467, + "grad_norm": 282.72015380859375, + "learning_rate": 5.466424682395644e-06, + "loss": 33.0781, + "step": 4516 + }, + { + "epoch": 16.306997742663658, + "grad_norm": 224.8140869140625, + "learning_rate": 5.460980036297641e-06, + "loss": 33.2963, + "step": 4517 + }, + { + "epoch": 16.31060948081264, + "grad_norm": 239.20570373535156, + "learning_rate": 5.4555353901996376e-06, + "loss": 34.4455, + "step": 4518 + }, + { + "epoch": 16.314221218961624, + "grad_norm": 304.7758483886719, + "learning_rate": 5.450090744101633e-06, + "loss": 34.534, + "step": 4519 + }, + { + "epoch": 16.31783295711061, + "grad_norm": 274.8758239746094, + "learning_rate": 5.44464609800363e-06, + "loss": 33.5232, + "step": 4520 + }, + { + "epoch": 16.31783295711061, + "eval_loss": 0.6031973958015442, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4520 + }, + { + "epoch": 16.321444695259594, + "grad_norm": 295.1776428222656, + "learning_rate": 5.439201451905626e-06, + "loss": 33.403, + "step": 4521 + }, + { + "epoch": 16.325056433408577, + "grad_norm": 309.03399658203125, + "learning_rate": 5.4337568058076225e-06, + "loss": 34.1785, + "step": 4522 + }, + { + "epoch": 16.328668171557563, + "grad_norm": 285.26385498046875, + "learning_rate": 5.428312159709619e-06, + "loss": 34.4855, + "step": 4523 + }, + { + "epoch": 16.332279909706546, + "grad_norm": 307.0184020996094, + "learning_rate": 5.422867513611616e-06, + "loss": 32.4791, + "step": 4524 + }, + { + "epoch": 16.33589164785553, + "grad_norm": 318.8267822265625, + "learning_rate": 5.417422867513612e-06, + "loss": 35.697, + "step": 4525 + }, + { + "epoch": 16.339503386004516, + "grad_norm": 356.0179138183594, + "learning_rate": 5.411978221415607e-06, + "loss": 36.1811, + "step": 4526 + }, + { + "epoch": 16.3431151241535, + "grad_norm": 332.1255187988281, + "learning_rate": 5.4065335753176045e-06, + "loss": 36.2251, + "step": 4527 + }, + { + "epoch": 16.346726862302482, + "grad_norm": 288.78118896484375, + "learning_rate": 5.401088929219601e-06, + "loss": 32.0518, + "step": 4528 + }, + { + "epoch": 16.35033860045147, + "grad_norm": 250.37245178222656, + "learning_rate": 5.395644283121598e-06, + "loss": 23.627, + "step": 4529 + }, + { + "epoch": 16.353950338600452, + "grad_norm": 199.92352294921875, + "learning_rate": 5.390199637023593e-06, + "loss": 21.7919, + "step": 4530 + }, + { + "epoch": 16.353950338600452, + "eval_loss": 0.6021688580513, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 4530 + }, + { + "epoch": 16.357562076749435, + "grad_norm": 265.47015380859375, + "learning_rate": 5.38475499092559e-06, + "loss": 23.0672, + "step": 4531 + }, + { + "epoch": 16.36117381489842, + "grad_norm": 281.188720703125, + "learning_rate": 5.3793103448275865e-06, + "loss": 22.7983, + "step": 4532 + }, + { + "epoch": 16.364785553047405, + "grad_norm": 195.5351104736328, + "learning_rate": 5.373865698729583e-06, + "loss": 38.1042, + "step": 4533 + }, + { + "epoch": 16.368397291196388, + "grad_norm": 234.76573181152344, + "learning_rate": 5.368421052631579e-06, + "loss": 39.8602, + "step": 4534 + }, + { + "epoch": 16.37200902934537, + "grad_norm": 237.9152374267578, + "learning_rate": 5.362976406533575e-06, + "loss": 40.2156, + "step": 4535 + }, + { + "epoch": 16.375620767494357, + "grad_norm": 297.722900390625, + "learning_rate": 5.357531760435572e-06, + "loss": 39.3676, + "step": 4536 + }, + { + "epoch": 16.37923250564334, + "grad_norm": 218.61727905273438, + "learning_rate": 5.352087114337568e-06, + "loss": 38.7905, + "step": 4537 + }, + { + "epoch": 16.382844243792324, + "grad_norm": 245.19561767578125, + "learning_rate": 5.346642468239565e-06, + "loss": 39.3998, + "step": 4538 + }, + { + "epoch": 16.38645598194131, + "grad_norm": 247.5048370361328, + "learning_rate": 5.341197822141561e-06, + "loss": 40.0835, + "step": 4539 + }, + { + "epoch": 16.390067720090293, + "grad_norm": 214.40684509277344, + "learning_rate": 5.335753176043558e-06, + "loss": 39.1135, + "step": 4540 + }, + { + "epoch": 16.390067720090293, + "eval_loss": 0.6014460325241089, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4540 + }, + { + "epoch": 16.393679458239276, + "grad_norm": 216.72271728515625, + "learning_rate": 5.330308529945553e-06, + "loss": 38.9449, + "step": 4541 + }, + { + "epoch": 16.397291196388263, + "grad_norm": 224.22262573242188, + "learning_rate": 5.32486388384755e-06, + "loss": 39.2646, + "step": 4542 + }, + { + "epoch": 16.400902934537246, + "grad_norm": 258.6524353027344, + "learning_rate": 5.319419237749547e-06, + "loss": 38.0846, + "step": 4543 + }, + { + "epoch": 16.40451467268623, + "grad_norm": 241.7313232421875, + "learning_rate": 5.313974591651543e-06, + "loss": 37.4963, + "step": 4544 + }, + { + "epoch": 16.408126410835216, + "grad_norm": 241.3990478515625, + "learning_rate": 5.308529945553539e-06, + "loss": 36.4783, + "step": 4545 + }, + { + "epoch": 16.4117381489842, + "grad_norm": 207.1470947265625, + "learning_rate": 5.303085299455535e-06, + "loss": 36.1592, + "step": 4546 + }, + { + "epoch": 16.415349887133182, + "grad_norm": 224.51690673828125, + "learning_rate": 5.2976406533575325e-06, + "loss": 35.7946, + "step": 4547 + }, + { + "epoch": 16.41896162528217, + "grad_norm": 292.4340515136719, + "learning_rate": 5.292196007259528e-06, + "loss": 36.8986, + "step": 4548 + }, + { + "epoch": 16.42257336343115, + "grad_norm": 244.67117309570312, + "learning_rate": 5.286751361161524e-06, + "loss": 37.1165, + "step": 4549 + }, + { + "epoch": 16.426185101580135, + "grad_norm": 331.14654541015625, + "learning_rate": 5.281306715063521e-06, + "loss": 36.4423, + "step": 4550 + }, + { + "epoch": 16.426185101580135, + "eval_loss": 0.6067427396774292, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4550 + }, + { + "epoch": 16.42979683972912, + "grad_norm": 262.373046875, + "learning_rate": 5.275862068965517e-06, + "loss": 39.0014, + "step": 4551 + }, + { + "epoch": 16.433408577878104, + "grad_norm": 237.48350524902344, + "learning_rate": 5.270417422867514e-06, + "loss": 38.0152, + "step": 4552 + }, + { + "epoch": 16.437020316027088, + "grad_norm": 273.0652770996094, + "learning_rate": 5.26497277676951e-06, + "loss": 37.6952, + "step": 4553 + }, + { + "epoch": 16.44063205417607, + "grad_norm": 239.0780029296875, + "learning_rate": 5.259528130671507e-06, + "loss": 38.4266, + "step": 4554 + }, + { + "epoch": 16.444243792325057, + "grad_norm": 277.978759765625, + "learning_rate": 5.254083484573503e-06, + "loss": 36.5596, + "step": 4555 + }, + { + "epoch": 16.44785553047404, + "grad_norm": 216.2267303466797, + "learning_rate": 5.248638838475499e-06, + "loss": 39.1408, + "step": 4556 + }, + { + "epoch": 16.451467268623023, + "grad_norm": 231.80581665039062, + "learning_rate": 5.243194192377496e-06, + "loss": 38.7286, + "step": 4557 + }, + { + "epoch": 16.45507900677201, + "grad_norm": 236.4004669189453, + "learning_rate": 5.237749546279492e-06, + "loss": 39.2426, + "step": 4558 + }, + { + "epoch": 16.458690744920993, + "grad_norm": 270.0268859863281, + "learning_rate": 5.232304900181488e-06, + "loss": 38.6546, + "step": 4559 + }, + { + "epoch": 16.462302483069976, + "grad_norm": 255.8044891357422, + "learning_rate": 5.226860254083484e-06, + "loss": 37.554, + "step": 4560 + }, + { + "epoch": 16.462302483069976, + "eval_loss": 0.6019929647445679, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4560 + }, + { + "epoch": 16.465914221218963, + "grad_norm": 321.18499755859375, + "learning_rate": 5.221415607985481e-06, + "loss": 34.9309, + "step": 4561 + }, + { + "epoch": 16.469525959367946, + "grad_norm": 311.94305419921875, + "learning_rate": 5.215970961887478e-06, + "loss": 35.8779, + "step": 4562 + }, + { + "epoch": 16.47313769751693, + "grad_norm": 211.90234375, + "learning_rate": 5.210526315789474e-06, + "loss": 31.8385, + "step": 4563 + }, + { + "epoch": 16.476749435665916, + "grad_norm": 284.64581298828125, + "learning_rate": 5.20508166969147e-06, + "loss": 31.8078, + "step": 4564 + }, + { + "epoch": 16.4803611738149, + "grad_norm": 291.94891357421875, + "learning_rate": 5.199637023593466e-06, + "loss": 33.2542, + "step": 4565 + }, + { + "epoch": 16.483972911963882, + "grad_norm": 243.61956787109375, + "learning_rate": 5.194192377495463e-06, + "loss": 31.5292, + "step": 4566 + }, + { + "epoch": 16.48758465011287, + "grad_norm": 242.07696533203125, + "learning_rate": 5.188747731397459e-06, + "loss": 33.9643, + "step": 4567 + }, + { + "epoch": 16.49119638826185, + "grad_norm": 255.0625457763672, + "learning_rate": 5.183303085299456e-06, + "loss": 33.7718, + "step": 4568 + }, + { + "epoch": 16.494808126410835, + "grad_norm": 249.40240478515625, + "learning_rate": 5.177858439201452e-06, + "loss": 31.5248, + "step": 4569 + }, + { + "epoch": 16.498419864559818, + "grad_norm": 231.3375244140625, + "learning_rate": 5.172413793103449e-06, + "loss": 34.5657, + "step": 4570 + }, + { + "epoch": 16.498419864559818, + "eval_loss": 0.6017265319824219, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.01, + "eval_steps_per_second": 57.01, + "step": 4570 + }, + { + "epoch": 16.502031602708804, + "grad_norm": 247.97012329101562, + "learning_rate": 5.1669691470054445e-06, + "loss": 33.766, + "step": 4571 + }, + { + "epoch": 16.505643340857787, + "grad_norm": 310.730224609375, + "learning_rate": 5.161524500907441e-06, + "loss": 34.0841, + "step": 4572 + }, + { + "epoch": 16.50925507900677, + "grad_norm": 323.5569152832031, + "learning_rate": 5.156079854809438e-06, + "loss": 35.0788, + "step": 4573 + }, + { + "epoch": 16.512866817155757, + "grad_norm": 247.95480346679688, + "learning_rate": 5.150635208711433e-06, + "loss": 33.5322, + "step": 4574 + }, + { + "epoch": 16.51647855530474, + "grad_norm": 307.6163024902344, + "learning_rate": 5.14519056261343e-06, + "loss": 34.4701, + "step": 4575 + }, + { + "epoch": 16.520090293453723, + "grad_norm": 239.569580078125, + "learning_rate": 5.1397459165154265e-06, + "loss": 35.8526, + "step": 4576 + }, + { + "epoch": 16.52370203160271, + "grad_norm": 362.4159240722656, + "learning_rate": 5.134301270417424e-06, + "loss": 36.2235, + "step": 4577 + }, + { + "epoch": 16.527313769751693, + "grad_norm": 321.2509765625, + "learning_rate": 5.128856624319419e-06, + "loss": 33.4705, + "step": 4578 + }, + { + "epoch": 16.530925507900676, + "grad_norm": 248.6092071533203, + "learning_rate": 5.123411978221415e-06, + "loss": 23.1329, + "step": 4579 + }, + { + "epoch": 16.534537246049663, + "grad_norm": 289.8996276855469, + "learning_rate": 5.117967332123412e-06, + "loss": 20.3184, + "step": 4580 + }, + { + "epoch": 16.534537246049663, + "eval_loss": 0.6034744381904602, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 4580 + }, + { + "epoch": 16.538148984198646, + "grad_norm": 215.02142333984375, + "learning_rate": 5.1125226860254085e-06, + "loss": 23.0513, + "step": 4581 + }, + { + "epoch": 16.54176072234763, + "grad_norm": 299.8429870605469, + "learning_rate": 5.107078039927405e-06, + "loss": 24.462, + "step": 4582 + }, + { + "epoch": 16.545372460496615, + "grad_norm": 267.0840759277344, + "learning_rate": 5.101633393829401e-06, + "loss": 39.9148, + "step": 4583 + }, + { + "epoch": 16.5489841986456, + "grad_norm": 227.23731994628906, + "learning_rate": 5.096188747731398e-06, + "loss": 40.6498, + "step": 4584 + }, + { + "epoch": 16.55259593679458, + "grad_norm": 313.9705810546875, + "learning_rate": 5.0907441016333935e-06, + "loss": 38.7711, + "step": 4585 + }, + { + "epoch": 16.55620767494357, + "grad_norm": 398.0429382324219, + "learning_rate": 5.0852994555353905e-06, + "loss": 39.6938, + "step": 4586 + }, + { + "epoch": 16.55981941309255, + "grad_norm": 365.489990234375, + "learning_rate": 5.079854809437387e-06, + "loss": 39.356, + "step": 4587 + }, + { + "epoch": 16.563431151241534, + "grad_norm": 365.05267333984375, + "learning_rate": 5.074410163339383e-06, + "loss": 40.2504, + "step": 4588 + }, + { + "epoch": 16.567042889390518, + "grad_norm": 288.0643310546875, + "learning_rate": 5.068965517241379e-06, + "loss": 39.6045, + "step": 4589 + }, + { + "epoch": 16.570654627539504, + "grad_norm": 262.0147705078125, + "learning_rate": 5.0635208711433755e-06, + "loss": 40.2504, + "step": 4590 + }, + { + "epoch": 16.570654627539504, + "eval_loss": 0.6028281450271606, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 4590 + }, + { + "epoch": 16.574266365688487, + "grad_norm": 325.78387451171875, + "learning_rate": 5.0580762250453725e-06, + "loss": 40.3154, + "step": 4591 + }, + { + "epoch": 16.57787810383747, + "grad_norm": 221.56591796875, + "learning_rate": 5.052631578947369e-06, + "loss": 39.5046, + "step": 4592 + }, + { + "epoch": 16.581489841986457, + "grad_norm": 227.02520751953125, + "learning_rate": 5.047186932849365e-06, + "loss": 38.3611, + "step": 4593 + }, + { + "epoch": 16.58510158013544, + "grad_norm": 232.46922302246094, + "learning_rate": 5.041742286751361e-06, + "loss": 36.5043, + "step": 4594 + }, + { + "epoch": 16.588713318284423, + "grad_norm": 230.59536743164062, + "learning_rate": 5.0362976406533575e-06, + "loss": 36.2179, + "step": 4595 + }, + { + "epoch": 16.59232505643341, + "grad_norm": 439.9609069824219, + "learning_rate": 5.0308529945553545e-06, + "loss": 36.4797, + "step": 4596 + }, + { + "epoch": 16.595936794582393, + "grad_norm": 322.4086608886719, + "learning_rate": 5.02540834845735e-06, + "loss": 37.4151, + "step": 4597 + }, + { + "epoch": 16.599548532731376, + "grad_norm": 318.1732482910156, + "learning_rate": 5.019963702359347e-06, + "loss": 37.2815, + "step": 4598 + }, + { + "epoch": 16.603160270880363, + "grad_norm": 321.34039306640625, + "learning_rate": 5.014519056261343e-06, + "loss": 36.8388, + "step": 4599 + }, + { + "epoch": 16.606772009029346, + "grad_norm": 341.28790283203125, + "learning_rate": 5.0090744101633395e-06, + "loss": 37.9805, + "step": 4600 + }, + { + "epoch": 16.606772009029346, + "eval_loss": 0.6045316457748413, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 4600 + }, + { + "epoch": 16.61038374717833, + "grad_norm": 259.9163513183594, + "learning_rate": 5.003629764065336e-06, + "loss": 37.5832, + "step": 4601 + }, + { + "epoch": 16.613995485327315, + "grad_norm": 297.02587890625, + "learning_rate": 4.998185117967332e-06, + "loss": 37.3808, + "step": 4602 + }, + { + "epoch": 16.6176072234763, + "grad_norm": 263.32244873046875, + "learning_rate": 4.992740471869329e-06, + "loss": 37.1047, + "step": 4603 + }, + { + "epoch": 16.62121896162528, + "grad_norm": 262.26104736328125, + "learning_rate": 4.987295825771324e-06, + "loss": 38.3592, + "step": 4604 + }, + { + "epoch": 16.624830699774268, + "grad_norm": 253.7144012451172, + "learning_rate": 4.9818511796733215e-06, + "loss": 37.4098, + "step": 4605 + }, + { + "epoch": 16.62844243792325, + "grad_norm": 279.1004943847656, + "learning_rate": 4.976406533575318e-06, + "loss": 39.3865, + "step": 4606 + }, + { + "epoch": 16.632054176072234, + "grad_norm": 298.7977600097656, + "learning_rate": 4.970961887477315e-06, + "loss": 38.6865, + "step": 4607 + }, + { + "epoch": 16.635665914221217, + "grad_norm": 256.7657470703125, + "learning_rate": 4.96551724137931e-06, + "loss": 38.7068, + "step": 4608 + }, + { + "epoch": 16.639277652370204, + "grad_norm": 238.22979736328125, + "learning_rate": 4.960072595281307e-06, + "loss": 37.749, + "step": 4609 + }, + { + "epoch": 16.642889390519187, + "grad_norm": 248.4231414794922, + "learning_rate": 4.9546279491833035e-06, + "loss": 37.582, + "step": 4610 + }, + { + "epoch": 16.642889390519187, + "eval_loss": 0.6026645302772522, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4610 + }, + { + "epoch": 16.64650112866817, + "grad_norm": 232.70289611816406, + "learning_rate": 4.949183303085299e-06, + "loss": 34.4589, + "step": 4611 + }, + { + "epoch": 16.650112866817157, + "grad_norm": 268.4678955078125, + "learning_rate": 4.943738656987296e-06, + "loss": 32.3619, + "step": 4612 + }, + { + "epoch": 16.65372460496614, + "grad_norm": 272.07794189453125, + "learning_rate": 4.938294010889292e-06, + "loss": 32.3436, + "step": 4613 + }, + { + "epoch": 16.657336343115123, + "grad_norm": 304.4588317871094, + "learning_rate": 4.932849364791289e-06, + "loss": 30.8798, + "step": 4614 + }, + { + "epoch": 16.66094808126411, + "grad_norm": 293.3638000488281, + "learning_rate": 4.927404718693285e-06, + "loss": 31.1892, + "step": 4615 + }, + { + "epoch": 16.664559819413093, + "grad_norm": 292.844482421875, + "learning_rate": 4.921960072595282e-06, + "loss": 31.9604, + "step": 4616 + }, + { + "epoch": 16.668171557562076, + "grad_norm": 246.45339965820312, + "learning_rate": 4.916515426497278e-06, + "loss": 32.242, + "step": 4617 + }, + { + "epoch": 16.671783295711062, + "grad_norm": 269.9577941894531, + "learning_rate": 4.911070780399274e-06, + "loss": 32.5072, + "step": 4618 + }, + { + "epoch": 16.675395033860045, + "grad_norm": 312.8960876464844, + "learning_rate": 4.90562613430127e-06, + "loss": 33.8243, + "step": 4619 + }, + { + "epoch": 16.67900677200903, + "grad_norm": 287.4557189941406, + "learning_rate": 4.900181488203267e-06, + "loss": 34.3557, + "step": 4620 + }, + { + "epoch": 16.67900677200903, + "eval_loss": 0.6047338843345642, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4620 + }, + { + "epoch": 16.682618510158015, + "grad_norm": 403.533935546875, + "learning_rate": 4.894736842105264e-06, + "loss": 34.6895, + "step": 4621 + }, + { + "epoch": 16.686230248306998, + "grad_norm": 387.5083923339844, + "learning_rate": 4.88929219600726e-06, + "loss": 34.2407, + "step": 4622 + }, + { + "epoch": 16.68984198645598, + "grad_norm": 278.8225402832031, + "learning_rate": 4.883847549909256e-06, + "loss": 33.3489, + "step": 4623 + }, + { + "epoch": 16.693453724604964, + "grad_norm": 270.46685791015625, + "learning_rate": 4.878402903811252e-06, + "loss": 34.2095, + "step": 4624 + }, + { + "epoch": 16.69706546275395, + "grad_norm": 244.6392059326172, + "learning_rate": 4.872958257713249e-06, + "loss": 35.783, + "step": 4625 + }, + { + "epoch": 16.700677200902934, + "grad_norm": 327.0617370605469, + "learning_rate": 4.867513611615245e-06, + "loss": 36.4928, + "step": 4626 + }, + { + "epoch": 16.704288939051917, + "grad_norm": 297.0531311035156, + "learning_rate": 4.862068965517241e-06, + "loss": 33.4827, + "step": 4627 + }, + { + "epoch": 16.707900677200904, + "grad_norm": 366.2174377441406, + "learning_rate": 4.856624319419238e-06, + "loss": 26.9456, + "step": 4628 + }, + { + "epoch": 16.711512415349887, + "grad_norm": 436.22613525390625, + "learning_rate": 4.851179673321234e-06, + "loss": 22.2349, + "step": 4629 + }, + { + "epoch": 16.71512415349887, + "grad_norm": 391.7647705078125, + "learning_rate": 4.845735027223231e-06, + "loss": 22.8557, + "step": 4630 + }, + { + "epoch": 16.71512415349887, + "eval_loss": 0.6052708029747009, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 4630 + }, + { + "epoch": 16.718735891647857, + "grad_norm": 277.8678283691406, + "learning_rate": 4.840290381125227e-06, + "loss": 23.3521, + "step": 4631 + }, + { + "epoch": 16.72234762979684, + "grad_norm": 252.46131896972656, + "learning_rate": 4.834845735027224e-06, + "loss": 23.7394, + "step": 4632 + }, + { + "epoch": 16.725959367945823, + "grad_norm": 214.6287078857422, + "learning_rate": 4.82940108892922e-06, + "loss": 38.6633, + "step": 4633 + }, + { + "epoch": 16.72957110609481, + "grad_norm": 257.454345703125, + "learning_rate": 4.8239564428312155e-06, + "loss": 40.5165, + "step": 4634 + }, + { + "epoch": 16.733182844243792, + "grad_norm": 211.1912841796875, + "learning_rate": 4.818511796733213e-06, + "loss": 38.483, + "step": 4635 + }, + { + "epoch": 16.736794582392776, + "grad_norm": 226.8388214111328, + "learning_rate": 4.813067150635209e-06, + "loss": 39.6143, + "step": 4636 + }, + { + "epoch": 16.740406320541762, + "grad_norm": 263.8160400390625, + "learning_rate": 4.807622504537205e-06, + "loss": 37.8442, + "step": 4637 + }, + { + "epoch": 16.744018058690745, + "grad_norm": 284.8119201660156, + "learning_rate": 4.802177858439201e-06, + "loss": 39.1835, + "step": 4638 + }, + { + "epoch": 16.74762979683973, + "grad_norm": 310.31390380859375, + "learning_rate": 4.796733212341198e-06, + "loss": 38.7035, + "step": 4639 + }, + { + "epoch": 16.751241534988715, + "grad_norm": 212.71315002441406, + "learning_rate": 4.791288566243195e-06, + "loss": 38.8803, + "step": 4640 + }, + { + "epoch": 16.751241534988715, + "eval_loss": 0.6030828952789307, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4640 + }, + { + "epoch": 16.754853273137698, + "grad_norm": 209.7708740234375, + "learning_rate": 4.78584392014519e-06, + "loss": 39.0808, + "step": 4641 + }, + { + "epoch": 16.75846501128668, + "grad_norm": 251.971435546875, + "learning_rate": 4.780399274047187e-06, + "loss": 39.2025, + "step": 4642 + }, + { + "epoch": 16.762076749435664, + "grad_norm": 210.54151916503906, + "learning_rate": 4.774954627949183e-06, + "loss": 37.7541, + "step": 4643 + }, + { + "epoch": 16.76568848758465, + "grad_norm": 221.22119140625, + "learning_rate": 4.76950998185118e-06, + "loss": 36.4328, + "step": 4644 + }, + { + "epoch": 16.769300225733634, + "grad_norm": 201.45025634765625, + "learning_rate": 4.764065335753176e-06, + "loss": 34.9771, + "step": 4645 + }, + { + "epoch": 16.772911963882617, + "grad_norm": 241.33030700683594, + "learning_rate": 4.758620689655173e-06, + "loss": 37.6231, + "step": 4646 + }, + { + "epoch": 16.776523702031604, + "grad_norm": 282.12255859375, + "learning_rate": 4.753176043557169e-06, + "loss": 36.9822, + "step": 4647 + }, + { + "epoch": 16.780135440180587, + "grad_norm": 239.93885803222656, + "learning_rate": 4.747731397459165e-06, + "loss": 36.3529, + "step": 4648 + }, + { + "epoch": 16.78374717832957, + "grad_norm": 245.9400634765625, + "learning_rate": 4.7422867513611615e-06, + "loss": 37.518, + "step": 4649 + }, + { + "epoch": 16.787358916478556, + "grad_norm": 280.63720703125, + "learning_rate": 4.736842105263158e-06, + "loss": 37.6323, + "step": 4650 + }, + { + "epoch": 16.787358916478556, + "eval_loss": 0.6054876446723938, + "eval_runtime": 3.1439, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4650 + }, + { + "epoch": 16.79097065462754, + "grad_norm": 368.47698974609375, + "learning_rate": 4.731397459165155e-06, + "loss": 38.1543, + "step": 4651 + }, + { + "epoch": 16.794582392776523, + "grad_norm": 346.9169616699219, + "learning_rate": 4.72595281306715e-06, + "loss": 38.8746, + "step": 4652 + }, + { + "epoch": 16.79819413092551, + "grad_norm": 311.7519836425781, + "learning_rate": 4.720508166969147e-06, + "loss": 37.3475, + "step": 4653 + }, + { + "epoch": 16.801805869074492, + "grad_norm": 323.14910888671875, + "learning_rate": 4.7150635208711435e-06, + "loss": 38.5308, + "step": 4654 + }, + { + "epoch": 16.805417607223475, + "grad_norm": 252.71958923339844, + "learning_rate": 4.70961887477314e-06, + "loss": 38.3275, + "step": 4655 + }, + { + "epoch": 16.809029345372462, + "grad_norm": 364.2929382324219, + "learning_rate": 4.704174228675136e-06, + "loss": 38.9973, + "step": 4656 + }, + { + "epoch": 16.812641083521445, + "grad_norm": 267.23980712890625, + "learning_rate": 4.698729582577132e-06, + "loss": 38.0867, + "step": 4657 + }, + { + "epoch": 16.816252821670428, + "grad_norm": 297.4647521972656, + "learning_rate": 4.693284936479129e-06, + "loss": 38.6933, + "step": 4658 + }, + { + "epoch": 16.819864559819415, + "grad_norm": 276.2767333984375, + "learning_rate": 4.6878402903811255e-06, + "loss": 38.0279, + "step": 4659 + }, + { + "epoch": 16.823476297968398, + "grad_norm": 261.5404052734375, + "learning_rate": 4.682395644283122e-06, + "loss": 36.5149, + "step": 4660 + }, + { + "epoch": 16.823476297968398, + "eval_loss": 0.6019832491874695, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4660 + }, + { + "epoch": 16.82708803611738, + "grad_norm": 313.2170104980469, + "learning_rate": 4.676950998185118e-06, + "loss": 35.6121, + "step": 4661 + }, + { + "epoch": 16.830699774266364, + "grad_norm": 297.2791442871094, + "learning_rate": 4.671506352087115e-06, + "loss": 31.1869, + "step": 4662 + }, + { + "epoch": 16.83431151241535, + "grad_norm": 269.7320556640625, + "learning_rate": 4.666061705989111e-06, + "loss": 31.8674, + "step": 4663 + }, + { + "epoch": 16.837923250564334, + "grad_norm": 245.3898468017578, + "learning_rate": 4.660617059891107e-06, + "loss": 30.3726, + "step": 4664 + }, + { + "epoch": 16.841534988713317, + "grad_norm": 244.63223266601562, + "learning_rate": 4.655172413793104e-06, + "loss": 32.6154, + "step": 4665 + }, + { + "epoch": 16.845146726862303, + "grad_norm": 263.6791076660156, + "learning_rate": 4.6497277676951e-06, + "loss": 33.0104, + "step": 4666 + }, + { + "epoch": 16.848758465011286, + "grad_norm": 398.6610107421875, + "learning_rate": 4.644283121597096e-06, + "loss": 32.5445, + "step": 4667 + }, + { + "epoch": 16.85237020316027, + "grad_norm": 312.8116149902344, + "learning_rate": 4.6388384754990924e-06, + "loss": 32.5698, + "step": 4668 + }, + { + "epoch": 16.855981941309256, + "grad_norm": 296.6167297363281, + "learning_rate": 4.6333938294010895e-06, + "loss": 33.1377, + "step": 4669 + }, + { + "epoch": 16.85959367945824, + "grad_norm": 285.299560546875, + "learning_rate": 4.627949183303086e-06, + "loss": 33.3279, + "step": 4670 + }, + { + "epoch": 16.85959367945824, + "eval_loss": 0.6027817726135254, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.985, + "eval_steps_per_second": 56.985, + "step": 4670 + }, + { + "epoch": 16.863205417607222, + "grad_norm": 285.2948913574219, + "learning_rate": 4.622504537205081e-06, + "loss": 35.6879, + "step": 4671 + }, + { + "epoch": 16.86681715575621, + "grad_norm": 280.6530456542969, + "learning_rate": 4.617059891107078e-06, + "loss": 32.3154, + "step": 4672 + }, + { + "epoch": 16.870428893905192, + "grad_norm": 314.206787109375, + "learning_rate": 4.6116152450090744e-06, + "loss": 34.3517, + "step": 4673 + }, + { + "epoch": 16.874040632054175, + "grad_norm": 305.9198913574219, + "learning_rate": 4.6061705989110715e-06, + "loss": 34.1571, + "step": 4674 + }, + { + "epoch": 16.877652370203162, + "grad_norm": 287.0543212890625, + "learning_rate": 4.600725952813067e-06, + "loss": 35.1647, + "step": 4675 + }, + { + "epoch": 16.881264108352145, + "grad_norm": 286.912109375, + "learning_rate": 4.595281306715064e-06, + "loss": 34.8698, + "step": 4676 + }, + { + "epoch": 16.884875846501128, + "grad_norm": 322.4527587890625, + "learning_rate": 4.58983666061706e-06, + "loss": 36.3449, + "step": 4677 + }, + { + "epoch": 16.888487584650115, + "grad_norm": 239.41659545898438, + "learning_rate": 4.584392014519056e-06, + "loss": 25.3085, + "step": 4678 + }, + { + "epoch": 16.892099322799098, + "grad_norm": 215.5685577392578, + "learning_rate": 4.578947368421053e-06, + "loss": 22.3485, + "step": 4679 + }, + { + "epoch": 16.89571106094808, + "grad_norm": 291.2452697753906, + "learning_rate": 4.573502722323049e-06, + "loss": 22.3257, + "step": 4680 + }, + { + "epoch": 16.89571106094808, + "eval_loss": 0.6040940284729004, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 4680 + }, + { + "epoch": 16.899322799097064, + "grad_norm": 291.39935302734375, + "learning_rate": 4.568058076225046e-06, + "loss": 23.268, + "step": 4681 + }, + { + "epoch": 16.90293453724605, + "grad_norm": 272.211181640625, + "learning_rate": 4.562613430127041e-06, + "loss": 23.7127, + "step": 4682 + }, + { + "epoch": 16.906546275395034, + "grad_norm": 220.84397888183594, + "learning_rate": 4.5571687840290384e-06, + "loss": 39.2488, + "step": 4683 + }, + { + "epoch": 16.910158013544017, + "grad_norm": 238.49859619140625, + "learning_rate": 4.551724137931035e-06, + "loss": 39.5643, + "step": 4684 + }, + { + "epoch": 16.913769751693003, + "grad_norm": 325.3870544433594, + "learning_rate": 4.546279491833032e-06, + "loss": 38.6149, + "step": 4685 + }, + { + "epoch": 16.917381489841986, + "grad_norm": 307.02349853515625, + "learning_rate": 4.540834845735027e-06, + "loss": 38.0317, + "step": 4686 + }, + { + "epoch": 16.92099322799097, + "grad_norm": 433.99359130859375, + "learning_rate": 4.535390199637023e-06, + "loss": 40.4567, + "step": 4687 + }, + { + "epoch": 16.924604966139956, + "grad_norm": 327.97015380859375, + "learning_rate": 4.5299455535390204e-06, + "loss": 40.3109, + "step": 4688 + }, + { + "epoch": 16.92821670428894, + "grad_norm": 257.20684814453125, + "learning_rate": 4.524500907441017e-06, + "loss": 36.2826, + "step": 4689 + }, + { + "epoch": 16.931828442437922, + "grad_norm": 402.6732177734375, + "learning_rate": 4.519056261343013e-06, + "loss": 36.9163, + "step": 4690 + }, + { + "epoch": 16.931828442437922, + "eval_loss": 0.6016727089881897, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4690 + }, + { + "epoch": 16.93544018058691, + "grad_norm": 380.8903503417969, + "learning_rate": 4.513611615245009e-06, + "loss": 36.7101, + "step": 4691 + }, + { + "epoch": 16.939051918735892, + "grad_norm": 365.4950256347656, + "learning_rate": 4.508166969147006e-06, + "loss": 37.9853, + "step": 4692 + }, + { + "epoch": 16.942663656884875, + "grad_norm": 302.3895568847656, + "learning_rate": 4.5027223230490016e-06, + "loss": 38.109, + "step": 4693 + }, + { + "epoch": 16.94627539503386, + "grad_norm": 333.5274963378906, + "learning_rate": 4.497277676950998e-06, + "loss": 37.5992, + "step": 4694 + }, + { + "epoch": 16.949887133182845, + "grad_norm": 364.3126525878906, + "learning_rate": 4.491833030852995e-06, + "loss": 38.0139, + "step": 4695 + }, + { + "epoch": 16.953498871331828, + "grad_norm": 509.94671630859375, + "learning_rate": 4.486388384754991e-06, + "loss": 39.8027, + "step": 4696 + }, + { + "epoch": 16.957110609480814, + "grad_norm": 507.8591613769531, + "learning_rate": 4.480943738656987e-06, + "loss": 40.0044, + "step": 4697 + }, + { + "epoch": 16.960722347629797, + "grad_norm": 324.5463562011719, + "learning_rate": 4.4754990925589836e-06, + "loss": 34.9058, + "step": 4698 + }, + { + "epoch": 16.96433408577878, + "grad_norm": 318.39801025390625, + "learning_rate": 4.470054446460981e-06, + "loss": 33.1318, + "step": 4699 + }, + { + "epoch": 16.967945823927764, + "grad_norm": 391.8466796875, + "learning_rate": 4.464609800362977e-06, + "loss": 32.2083, + "step": 4700 + }, + { + "epoch": 16.967945823927764, + "eval_loss": 0.6047930717468262, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4700 + }, + { + "epoch": 16.97155756207675, + "grad_norm": 530.4073486328125, + "learning_rate": 4.459165154264972e-06, + "loss": 31.9882, + "step": 4701 + }, + { + "epoch": 16.975169300225733, + "grad_norm": 590.9242553710938, + "learning_rate": 4.453720508166969e-06, + "loss": 34.1937, + "step": 4702 + }, + { + "epoch": 16.978781038374716, + "grad_norm": 377.5596618652344, + "learning_rate": 4.4482758620689656e-06, + "loss": 34.6501, + "step": 4703 + }, + { + "epoch": 16.982392776523703, + "grad_norm": 431.2909240722656, + "learning_rate": 4.442831215970962e-06, + "loss": 33.9402, + "step": 4704 + }, + { + "epoch": 16.986004514672686, + "grad_norm": 294.7673645019531, + "learning_rate": 4.437386569872958e-06, + "loss": 33.7873, + "step": 4705 + }, + { + "epoch": 16.98961625282167, + "grad_norm": 346.1203918457031, + "learning_rate": 4.431941923774955e-06, + "loss": 35.2935, + "step": 4706 + }, + { + "epoch": 16.993227990970656, + "grad_norm": 257.8351745605469, + "learning_rate": 4.426497277676951e-06, + "loss": 28.3513, + "step": 4707 + }, + { + "epoch": 16.99683972911964, + "grad_norm": 168.35118103027344, + "learning_rate": 4.421052631578947e-06, + "loss": 22.3009, + "step": 4708 + }, + { + "epoch": 17.0, + "grad_norm": 210.20738220214844, + "learning_rate": 4.415607985480944e-06, + "loss": 20.1848, + "step": 4709 + }, + { + "epoch": 17.003611738148983, + "grad_norm": 234.40866088867188, + "learning_rate": 4.41016333938294e-06, + "loss": 38.0969, + "step": 4710 + }, + { + "epoch": 17.003611738148983, + "eval_loss": 0.6026900410652161, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4710 + }, + { + "epoch": 17.00722347629797, + "grad_norm": 242.27195739746094, + "learning_rate": 4.404718693284937e-06, + "loss": 38.8902, + "step": 4711 + }, + { + "epoch": 17.010835214446953, + "grad_norm": 215.1695556640625, + "learning_rate": 4.3992740471869325e-06, + "loss": 38.5509, + "step": 4712 + }, + { + "epoch": 17.014446952595936, + "grad_norm": 390.2027587890625, + "learning_rate": 4.3938294010889296e-06, + "loss": 38.5247, + "step": 4713 + }, + { + "epoch": 17.018058690744923, + "grad_norm": 397.77484130859375, + "learning_rate": 4.388384754990926e-06, + "loss": 39.1981, + "step": 4714 + }, + { + "epoch": 17.021670428893906, + "grad_norm": 298.10089111328125, + "learning_rate": 4.382940108892923e-06, + "loss": 38.2627, + "step": 4715 + }, + { + "epoch": 17.02528216704289, + "grad_norm": 291.7283935546875, + "learning_rate": 4.377495462794918e-06, + "loss": 38.8027, + "step": 4716 + }, + { + "epoch": 17.028893905191875, + "grad_norm": 254.8542938232422, + "learning_rate": 4.3720508166969145e-06, + "loss": 38.6095, + "step": 4717 + }, + { + "epoch": 17.03250564334086, + "grad_norm": 244.336181640625, + "learning_rate": 4.3666061705989116e-06, + "loss": 38.2955, + "step": 4718 + }, + { + "epoch": 17.03611738148984, + "grad_norm": 376.92523193359375, + "learning_rate": 4.361161524500907e-06, + "loss": 38.5203, + "step": 4719 + }, + { + "epoch": 17.039729119638825, + "grad_norm": 339.6172790527344, + "learning_rate": 4.355716878402904e-06, + "loss": 37.4332, + "step": 4720 + }, + { + "epoch": 17.039729119638825, + "eval_loss": 0.6024167537689209, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4720 + }, + { + "epoch": 17.04334085778781, + "grad_norm": 433.0855712890625, + "learning_rate": 4.3502722323049e-06, + "loss": 36.4444, + "step": 4721 + }, + { + "epoch": 17.046952595936794, + "grad_norm": 224.3468475341797, + "learning_rate": 4.344827586206897e-06, + "loss": 35.7802, + "step": 4722 + }, + { + "epoch": 17.050564334085777, + "grad_norm": 385.5466003417969, + "learning_rate": 4.339382940108893e-06, + "loss": 35.4641, + "step": 4723 + }, + { + "epoch": 17.054176072234764, + "grad_norm": 311.80596923828125, + "learning_rate": 4.333938294010889e-06, + "loss": 36.4231, + "step": 4724 + }, + { + "epoch": 17.057787810383747, + "grad_norm": 283.189453125, + "learning_rate": 4.328493647912886e-06, + "loss": 37.5405, + "step": 4725 + }, + { + "epoch": 17.06139954853273, + "grad_norm": 403.85833740234375, + "learning_rate": 4.323049001814882e-06, + "loss": 37.4723, + "step": 4726 + }, + { + "epoch": 17.065011286681717, + "grad_norm": 390.03515625, + "learning_rate": 4.3176043557168785e-06, + "loss": 36.6799, + "step": 4727 + }, + { + "epoch": 17.0686230248307, + "grad_norm": 318.63427734375, + "learning_rate": 4.312159709618875e-06, + "loss": 36.6312, + "step": 4728 + }, + { + "epoch": 17.072234762979683, + "grad_norm": 318.43402099609375, + "learning_rate": 4.306715063520872e-06, + "loss": 37.9104, + "step": 4729 + }, + { + "epoch": 17.07584650112867, + "grad_norm": 320.9336853027344, + "learning_rate": 4.301270417422867e-06, + "loss": 36.7254, + "step": 4730 + }, + { + "epoch": 17.07584650112867, + "eval_loss": 0.6046721339225769, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4730 + }, + { + "epoch": 17.079458239277653, + "grad_norm": 345.9001770019531, + "learning_rate": 4.295825771324863e-06, + "loss": 36.0298, + "step": 4731 + }, + { + "epoch": 17.083069977426636, + "grad_norm": 397.10369873046875, + "learning_rate": 4.2903811252268605e-06, + "loss": 37.9418, + "step": 4732 + }, + { + "epoch": 17.086681715575622, + "grad_norm": 293.1039123535156, + "learning_rate": 4.284936479128857e-06, + "loss": 37.2627, + "step": 4733 + }, + { + "epoch": 17.090293453724605, + "grad_norm": 412.5190734863281, + "learning_rate": 4.279491833030853e-06, + "loss": 38.3429, + "step": 4734 + }, + { + "epoch": 17.09390519187359, + "grad_norm": 241.35105895996094, + "learning_rate": 4.274047186932849e-06, + "loss": 38.559, + "step": 4735 + }, + { + "epoch": 17.097516930022575, + "grad_norm": 275.169189453125, + "learning_rate": 4.268602540834846e-06, + "loss": 36.8167, + "step": 4736 + }, + { + "epoch": 17.101128668171558, + "grad_norm": 272.3182678222656, + "learning_rate": 4.2631578947368425e-06, + "loss": 37.0246, + "step": 4737 + }, + { + "epoch": 17.10474040632054, + "grad_norm": 215.6425018310547, + "learning_rate": 4.257713248638839e-06, + "loss": 33.1282, + "step": 4738 + }, + { + "epoch": 17.108352144469524, + "grad_norm": 276.6223449707031, + "learning_rate": 4.252268602540835e-06, + "loss": 33.2698, + "step": 4739 + }, + { + "epoch": 17.11196388261851, + "grad_norm": 311.1632385253906, + "learning_rate": 4.246823956442831e-06, + "loss": 31.0105, + "step": 4740 + }, + { + "epoch": 17.11196388261851, + "eval_loss": 0.6019421815872192, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.089, + "eval_steps_per_second": 57.089, + "step": 4740 + }, + { + "epoch": 17.115575620767494, + "grad_norm": 254.7543487548828, + "learning_rate": 4.241379310344828e-06, + "loss": 31.4721, + "step": 4741 + }, + { + "epoch": 17.119187358916477, + "grad_norm": 239.24957275390625, + "learning_rate": 4.235934664246824e-06, + "loss": 31.0346, + "step": 4742 + }, + { + "epoch": 17.122799097065464, + "grad_norm": 262.0681457519531, + "learning_rate": 4.230490018148821e-06, + "loss": 32.0604, + "step": 4743 + }, + { + "epoch": 17.126410835214447, + "grad_norm": 218.3557586669922, + "learning_rate": 4.225045372050817e-06, + "loss": 32.2036, + "step": 4744 + }, + { + "epoch": 17.13002257336343, + "grad_norm": 277.5924072265625, + "learning_rate": 4.219600725952813e-06, + "loss": 32.1412, + "step": 4745 + }, + { + "epoch": 17.133634311512417, + "grad_norm": 226.93211364746094, + "learning_rate": 4.214156079854809e-06, + "loss": 34.3367, + "step": 4746 + }, + { + "epoch": 17.1372460496614, + "grad_norm": 303.2422180175781, + "learning_rate": 4.208711433756806e-06, + "loss": 33.2001, + "step": 4747 + }, + { + "epoch": 17.140857787810383, + "grad_norm": 257.6164245605469, + "learning_rate": 4.203266787658803e-06, + "loss": 34.155, + "step": 4748 + }, + { + "epoch": 17.14446952595937, + "grad_norm": 361.1567077636719, + "learning_rate": 4.197822141560798e-06, + "loss": 35.236, + "step": 4749 + }, + { + "epoch": 17.148081264108352, + "grad_norm": 292.0034484863281, + "learning_rate": 4.192377495462795e-06, + "loss": 34.304, + "step": 4750 + }, + { + "epoch": 17.148081264108352, + "eval_loss": 0.6034401059150696, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4750 + }, + { + "epoch": 17.151693002257336, + "grad_norm": 327.8070983886719, + "learning_rate": 4.186932849364791e-06, + "loss": 33.7346, + "step": 4751 + }, + { + "epoch": 17.155304740406322, + "grad_norm": 312.9547119140625, + "learning_rate": 4.1814882032667885e-06, + "loss": 35.9274, + "step": 4752 + }, + { + "epoch": 17.158916478555305, + "grad_norm": 305.19500732421875, + "learning_rate": 4.176043557168784e-06, + "loss": 35.5567, + "step": 4753 + }, + { + "epoch": 17.16252821670429, + "grad_norm": 339.37152099609375, + "learning_rate": 4.17059891107078e-06, + "loss": 35.8013, + "step": 4754 + }, + { + "epoch": 17.16613995485327, + "grad_norm": 247.36679077148438, + "learning_rate": 4.165154264972777e-06, + "loss": 29.2211, + "step": 4755 + }, + { + "epoch": 17.169751693002258, + "grad_norm": 255.65269470214844, + "learning_rate": 4.1597096188747725e-06, + "loss": 21.6191, + "step": 4756 + }, + { + "epoch": 17.17336343115124, + "grad_norm": 239.66448974609375, + "learning_rate": 4.15426497277677e-06, + "loss": 22.0521, + "step": 4757 + }, + { + "epoch": 17.176975169300224, + "grad_norm": 212.25955200195312, + "learning_rate": 4.148820326678766e-06, + "loss": 22.6641, + "step": 4758 + }, + { + "epoch": 17.18058690744921, + "grad_norm": 229.9394073486328, + "learning_rate": 4.143375680580763e-06, + "loss": 22.8787, + "step": 4759 + }, + { + "epoch": 17.184198645598194, + "grad_norm": 237.46343994140625, + "learning_rate": 4.137931034482758e-06, + "loss": 39.1222, + "step": 4760 + }, + { + "epoch": 17.184198645598194, + "eval_loss": 0.6031526327133179, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4760 + }, + { + "epoch": 17.187810383747177, + "grad_norm": 229.23849487304688, + "learning_rate": 4.132486388384755e-06, + "loss": 39.7664, + "step": 4761 + }, + { + "epoch": 17.191422121896164, + "grad_norm": 250.67529296875, + "learning_rate": 4.127041742286752e-06, + "loss": 38.6754, + "step": 4762 + }, + { + "epoch": 17.195033860045147, + "grad_norm": 272.9320068359375, + "learning_rate": 4.121597096188748e-06, + "loss": 39.1262, + "step": 4763 + }, + { + "epoch": 17.19864559819413, + "grad_norm": 267.82427978515625, + "learning_rate": 4.116152450090744e-06, + "loss": 38.2223, + "step": 4764 + }, + { + "epoch": 17.202257336343116, + "grad_norm": 266.35760498046875, + "learning_rate": 4.11070780399274e-06, + "loss": 39.2069, + "step": 4765 + }, + { + "epoch": 17.2058690744921, + "grad_norm": 221.62606811523438, + "learning_rate": 4.105263157894737e-06, + "loss": 38.8956, + "step": 4766 + }, + { + "epoch": 17.209480812641083, + "grad_norm": 243.73110961914062, + "learning_rate": 4.099818511796734e-06, + "loss": 41.5868, + "step": 4767 + }, + { + "epoch": 17.21309255079007, + "grad_norm": 268.6092224121094, + "learning_rate": 4.09437386569873e-06, + "loss": 39.1041, + "step": 4768 + }, + { + "epoch": 17.216704288939052, + "grad_norm": 300.3140563964844, + "learning_rate": 4.088929219600726e-06, + "loss": 38.25, + "step": 4769 + }, + { + "epoch": 17.220316027088035, + "grad_norm": 264.56805419921875, + "learning_rate": 4.083484573502722e-06, + "loss": 38.186, + "step": 4770 + }, + { + "epoch": 17.220316027088035, + "eval_loss": 0.6044566631317139, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4770 + }, + { + "epoch": 17.223927765237022, + "grad_norm": 303.47686767578125, + "learning_rate": 4.0780399274047185e-06, + "loss": 37.7011, + "step": 4771 + }, + { + "epoch": 17.227539503386005, + "grad_norm": 238.3590545654297, + "learning_rate": 4.072595281306715e-06, + "loss": 34.6695, + "step": 4772 + }, + { + "epoch": 17.231151241534988, + "grad_norm": 252.90081787109375, + "learning_rate": 4.067150635208712e-06, + "loss": 36.1903, + "step": 4773 + }, + { + "epoch": 17.23476297968397, + "grad_norm": 286.5584716796875, + "learning_rate": 4.061705989110708e-06, + "loss": 36.4185, + "step": 4774 + }, + { + "epoch": 17.238374717832958, + "grad_norm": 322.25323486328125, + "learning_rate": 4.056261343012704e-06, + "loss": 36.0098, + "step": 4775 + }, + { + "epoch": 17.24198645598194, + "grad_norm": 292.09405517578125, + "learning_rate": 4.0508166969147005e-06, + "loss": 35.4347, + "step": 4776 + }, + { + "epoch": 17.245598194130924, + "grad_norm": 295.9725341796875, + "learning_rate": 4.045372050816697e-06, + "loss": 37.3512, + "step": 4777 + }, + { + "epoch": 17.24920993227991, + "grad_norm": 326.34539794921875, + "learning_rate": 4.039927404718694e-06, + "loss": 38.6739, + "step": 4778 + }, + { + "epoch": 17.252821670428894, + "grad_norm": 384.3682861328125, + "learning_rate": 4.034482758620689e-06, + "loss": 38.0995, + "step": 4779 + }, + { + "epoch": 17.256433408577877, + "grad_norm": 400.59136962890625, + "learning_rate": 4.029038112522686e-06, + "loss": 36.7733, + "step": 4780 + }, + { + "epoch": 17.256433408577877, + "eval_loss": 0.6064656972885132, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4780 + }, + { + "epoch": 17.260045146726863, + "grad_norm": 379.5261535644531, + "learning_rate": 4.0235934664246825e-06, + "loss": 36.1385, + "step": 4781 + }, + { + "epoch": 17.263656884875846, + "grad_norm": 277.1004638671875, + "learning_rate": 4.018148820326679e-06, + "loss": 39.1495, + "step": 4782 + }, + { + "epoch": 17.26726862302483, + "grad_norm": 274.6176452636719, + "learning_rate": 4.012704174228675e-06, + "loss": 37.8503, + "step": 4783 + }, + { + "epoch": 17.270880361173816, + "grad_norm": 338.9375305175781, + "learning_rate": 4.007259528130671e-06, + "loss": 39.7149, + "step": 4784 + }, + { + "epoch": 17.2744920993228, + "grad_norm": 299.60662841796875, + "learning_rate": 4.001814882032668e-06, + "loss": 37.6013, + "step": 4785 + }, + { + "epoch": 17.278103837471782, + "grad_norm": 278.9190368652344, + "learning_rate": 3.996370235934664e-06, + "loss": 38.1106, + "step": 4786 + }, + { + "epoch": 17.28171557562077, + "grad_norm": 254.48443603515625, + "learning_rate": 3.990925589836661e-06, + "loss": 35.9676, + "step": 4787 + }, + { + "epoch": 17.285327313769752, + "grad_norm": 274.65338134765625, + "learning_rate": 3.985480943738657e-06, + "loss": 35.3535, + "step": 4788 + }, + { + "epoch": 17.288939051918735, + "grad_norm": 288.748779296875, + "learning_rate": 3.980036297640654e-06, + "loss": 32.7356, + "step": 4789 + }, + { + "epoch": 17.292550790067722, + "grad_norm": 229.0682830810547, + "learning_rate": 3.9745916515426495e-06, + "loss": 31.2048, + "step": 4790 + }, + { + "epoch": 17.292550790067722, + "eval_loss": 0.6020387411117554, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 4790 + }, + { + "epoch": 17.296162528216705, + "grad_norm": 234.29937744140625, + "learning_rate": 3.9691470054446465e-06, + "loss": 31.7953, + "step": 4791 + }, + { + "epoch": 17.299774266365688, + "grad_norm": 236.3527069091797, + "learning_rate": 3.963702359346643e-06, + "loss": 31.6686, + "step": 4792 + }, + { + "epoch": 17.30338600451467, + "grad_norm": 253.44126892089844, + "learning_rate": 3.958257713248639e-06, + "loss": 31.8848, + "step": 4793 + }, + { + "epoch": 17.306997742663658, + "grad_norm": 270.66046142578125, + "learning_rate": 3.952813067150635e-06, + "loss": 32.1593, + "step": 4794 + }, + { + "epoch": 17.31060948081264, + "grad_norm": 242.77777099609375, + "learning_rate": 3.9473684210526315e-06, + "loss": 32.4555, + "step": 4795 + }, + { + "epoch": 17.314221218961624, + "grad_norm": 243.9296112060547, + "learning_rate": 3.9419237749546285e-06, + "loss": 34.0444, + "step": 4796 + }, + { + "epoch": 17.31783295711061, + "grad_norm": 276.2138671875, + "learning_rate": 3.936479128856624e-06, + "loss": 32.0404, + "step": 4797 + }, + { + "epoch": 17.321444695259594, + "grad_norm": 262.97802734375, + "learning_rate": 3.931034482758621e-06, + "loss": 32.4535, + "step": 4798 + }, + { + "epoch": 17.325056433408577, + "grad_norm": 338.9852600097656, + "learning_rate": 3.925589836660617e-06, + "loss": 34.6855, + "step": 4799 + }, + { + "epoch": 17.328668171557563, + "grad_norm": 270.85650634765625, + "learning_rate": 3.9201451905626135e-06, + "loss": 32.2425, + "step": 4800 + }, + { + "epoch": 17.328668171557563, + "eval_loss": 0.603055477142334, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4800 + }, + { + "epoch": 17.332279909706546, + "grad_norm": 289.17584228515625, + "learning_rate": 3.91470054446461e-06, + "loss": 34.6461, + "step": 4801 + }, + { + "epoch": 17.33589164785553, + "grad_norm": 301.120361328125, + "learning_rate": 3.909255898366606e-06, + "loss": 34.5622, + "step": 4802 + }, + { + "epoch": 17.339503386004516, + "grad_norm": 328.93524169921875, + "learning_rate": 3.903811252268603e-06, + "loss": 34.9585, + "step": 4803 + }, + { + "epoch": 17.3431151241535, + "grad_norm": 445.72003173828125, + "learning_rate": 3.898366606170599e-06, + "loss": 36.9729, + "step": 4804 + }, + { + "epoch": 17.346726862302482, + "grad_norm": 249.7901153564453, + "learning_rate": 3.8929219600725955e-06, + "loss": 30.1609, + "step": 4805 + }, + { + "epoch": 17.35033860045147, + "grad_norm": 230.1756134033203, + "learning_rate": 3.887477313974592e-06, + "loss": 21.6742, + "step": 4806 + }, + { + "epoch": 17.353950338600452, + "grad_norm": 193.68104553222656, + "learning_rate": 3.882032667876588e-06, + "loss": 22.0064, + "step": 4807 + }, + { + "epoch": 17.357562076749435, + "grad_norm": 232.58486938476562, + "learning_rate": 3.876588021778585e-06, + "loss": 23.1576, + "step": 4808 + }, + { + "epoch": 17.36117381489842, + "grad_norm": 256.0340270996094, + "learning_rate": 3.87114337568058e-06, + "loss": 23.5346, + "step": 4809 + }, + { + "epoch": 17.364785553047405, + "grad_norm": 260.8665771484375, + "learning_rate": 3.8656987295825775e-06, + "loss": 39.5267, + "step": 4810 + }, + { + "epoch": 17.364785553047405, + "eval_loss": 0.6040924191474915, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 4810 + }, + { + "epoch": 17.368397291196388, + "grad_norm": 253.2076873779297, + "learning_rate": 3.860254083484574e-06, + "loss": 40.222, + "step": 4811 + }, + { + "epoch": 17.37200902934537, + "grad_norm": 232.68162536621094, + "learning_rate": 3.85480943738657e-06, + "loss": 38.8405, + "step": 4812 + }, + { + "epoch": 17.375620767494357, + "grad_norm": 264.7735290527344, + "learning_rate": 3.849364791288566e-06, + "loss": 37.8169, + "step": 4813 + }, + { + "epoch": 17.37923250564334, + "grad_norm": 305.1289978027344, + "learning_rate": 3.843920145190563e-06, + "loss": 39.4413, + "step": 4814 + }, + { + "epoch": 17.382844243792324, + "grad_norm": 409.03106689453125, + "learning_rate": 3.8384754990925594e-06, + "loss": 40.146, + "step": 4815 + }, + { + "epoch": 17.38645598194131, + "grad_norm": 307.2272644042969, + "learning_rate": 3.833030852994555e-06, + "loss": 39.0141, + "step": 4816 + }, + { + "epoch": 17.390067720090293, + "grad_norm": 272.6708068847656, + "learning_rate": 3.827586206896552e-06, + "loss": 39.4356, + "step": 4817 + }, + { + "epoch": 17.393679458239276, + "grad_norm": 239.75225830078125, + "learning_rate": 3.822141560798548e-06, + "loss": 39.1581, + "step": 4818 + }, + { + "epoch": 17.397291196388263, + "grad_norm": 203.42205810546875, + "learning_rate": 3.816696914700545e-06, + "loss": 39.9827, + "step": 4819 + }, + { + "epoch": 17.400902934537246, + "grad_norm": 217.77159118652344, + "learning_rate": 3.811252268602541e-06, + "loss": 37.5404, + "step": 4820 + }, + { + "epoch": 17.400902934537246, + "eval_loss": 0.6033807396888733, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4820 + }, + { + "epoch": 17.40451467268623, + "grad_norm": 257.9713134765625, + "learning_rate": 3.8058076225045377e-06, + "loss": 35.6571, + "step": 4821 + }, + { + "epoch": 17.408126410835216, + "grad_norm": 295.11468505859375, + "learning_rate": 3.8003629764065335e-06, + "loss": 34.7256, + "step": 4822 + }, + { + "epoch": 17.4117381489842, + "grad_norm": 248.15908813476562, + "learning_rate": 3.7949183303085297e-06, + "loss": 37.3417, + "step": 4823 + }, + { + "epoch": 17.415349887133182, + "grad_norm": 295.19085693359375, + "learning_rate": 3.7894736842105264e-06, + "loss": 37.0117, + "step": 4824 + }, + { + "epoch": 17.41896162528217, + "grad_norm": 249.31576538085938, + "learning_rate": 3.7840290381125226e-06, + "loss": 37.168, + "step": 4825 + }, + { + "epoch": 17.42257336343115, + "grad_norm": 271.1731262207031, + "learning_rate": 3.7785843920145193e-06, + "loss": 35.9932, + "step": 4826 + }, + { + "epoch": 17.426185101580135, + "grad_norm": 380.6817626953125, + "learning_rate": 3.7731397459165155e-06, + "loss": 36.952, + "step": 4827 + }, + { + "epoch": 17.42979683972912, + "grad_norm": 370.125244140625, + "learning_rate": 3.767695099818512e-06, + "loss": 38.2224, + "step": 4828 + }, + { + "epoch": 17.433408577878104, + "grad_norm": 291.13568115234375, + "learning_rate": 3.7622504537205084e-06, + "loss": 38.5377, + "step": 4829 + }, + { + "epoch": 17.437020316027088, + "grad_norm": 329.5670471191406, + "learning_rate": 3.756805807622504e-06, + "loss": 38.1665, + "step": 4830 + }, + { + "epoch": 17.437020316027088, + "eval_loss": 0.6047329902648926, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4830 + }, + { + "epoch": 17.44063205417607, + "grad_norm": 266.0620422363281, + "learning_rate": 3.7513611615245012e-06, + "loss": 34.8371, + "step": 4831 + }, + { + "epoch": 17.444243792325057, + "grad_norm": 257.48980712890625, + "learning_rate": 3.7459165154264975e-06, + "loss": 37.1885, + "step": 4832 + }, + { + "epoch": 17.44785553047404, + "grad_norm": 346.8575439453125, + "learning_rate": 3.740471869328494e-06, + "loss": 38.1426, + "step": 4833 + }, + { + "epoch": 17.451467268623023, + "grad_norm": 246.66868591308594, + "learning_rate": 3.73502722323049e-06, + "loss": 37.6658, + "step": 4834 + }, + { + "epoch": 17.45507900677201, + "grad_norm": 309.71087646484375, + "learning_rate": 3.729582577132486e-06, + "loss": 38.2335, + "step": 4835 + }, + { + "epoch": 17.458690744920993, + "grad_norm": 304.1862487792969, + "learning_rate": 3.724137931034483e-06, + "loss": 38.5964, + "step": 4836 + }, + { + "epoch": 17.462302483069976, + "grad_norm": 253.73211669921875, + "learning_rate": 3.718693284936479e-06, + "loss": 38.9237, + "step": 4837 + }, + { + "epoch": 17.465914221218963, + "grad_norm": 208.52822875976562, + "learning_rate": 3.7132486388384757e-06, + "loss": 35.9177, + "step": 4838 + }, + { + "epoch": 17.469525959367946, + "grad_norm": 258.5502014160156, + "learning_rate": 3.707803992740472e-06, + "loss": 33.2577, + "step": 4839 + }, + { + "epoch": 17.47313769751693, + "grad_norm": 269.1754150390625, + "learning_rate": 3.7023593466424686e-06, + "loss": 31.2634, + "step": 4840 + }, + { + "epoch": 17.47313769751693, + "eval_loss": 0.6035012006759644, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4840 + }, + { + "epoch": 17.476749435665916, + "grad_norm": 268.5780029296875, + "learning_rate": 3.6969147005444644e-06, + "loss": 30.6732, + "step": 4841 + }, + { + "epoch": 17.4803611738149, + "grad_norm": 223.7191619873047, + "learning_rate": 3.691470054446461e-06, + "loss": 31.5905, + "step": 4842 + }, + { + "epoch": 17.483972911963882, + "grad_norm": 266.960205078125, + "learning_rate": 3.6860254083484573e-06, + "loss": 31.9407, + "step": 4843 + }, + { + "epoch": 17.48758465011287, + "grad_norm": 241.2608184814453, + "learning_rate": 3.680580762250454e-06, + "loss": 31.8078, + "step": 4844 + }, + { + "epoch": 17.49119638826185, + "grad_norm": 315.95166015625, + "learning_rate": 3.67513611615245e-06, + "loss": 33.5336, + "step": 4845 + }, + { + "epoch": 17.494808126410835, + "grad_norm": 277.731689453125, + "learning_rate": 3.669691470054447e-06, + "loss": 33.0484, + "step": 4846 + }, + { + "epoch": 17.498419864559818, + "grad_norm": 272.35137939453125, + "learning_rate": 3.664246823956443e-06, + "loss": 33.5048, + "step": 4847 + }, + { + "epoch": 17.502031602708804, + "grad_norm": 260.4573974609375, + "learning_rate": 3.6588021778584393e-06, + "loss": 33.5782, + "step": 4848 + }, + { + "epoch": 17.505643340857787, + "grad_norm": 285.7935485839844, + "learning_rate": 3.6533575317604355e-06, + "loss": 35.0308, + "step": 4849 + }, + { + "epoch": 17.50925507900677, + "grad_norm": 267.613037109375, + "learning_rate": 3.6479128856624317e-06, + "loss": 34.8067, + "step": 4850 + }, + { + "epoch": 17.50925507900677, + "eval_loss": 0.6035751700401306, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4850 + }, + { + "epoch": 17.512866817155757, + "grad_norm": 301.43536376953125, + "learning_rate": 3.6424682395644284e-06, + "loss": 33.1631, + "step": 4851 + }, + { + "epoch": 17.51647855530474, + "grad_norm": 270.10467529296875, + "learning_rate": 3.6370235934664246e-06, + "loss": 32.978, + "step": 4852 + }, + { + "epoch": 17.520090293453723, + "grad_norm": 280.802001953125, + "learning_rate": 3.6315789473684213e-06, + "loss": 35.3346, + "step": 4853 + }, + { + "epoch": 17.52370203160271, + "grad_norm": 314.7720031738281, + "learning_rate": 3.6261343012704175e-06, + "loss": 33.4881, + "step": 4854 + }, + { + "epoch": 17.527313769751693, + "grad_norm": 347.4674072265625, + "learning_rate": 3.620689655172414e-06, + "loss": 31.5599, + "step": 4855 + }, + { + "epoch": 17.530925507900676, + "grad_norm": 207.3061981201172, + "learning_rate": 3.61524500907441e-06, + "loss": 22.159, + "step": 4856 + }, + { + "epoch": 17.534537246049663, + "grad_norm": 216.7202911376953, + "learning_rate": 3.6098003629764066e-06, + "loss": 21.6584, + "step": 4857 + }, + { + "epoch": 17.538148984198646, + "grad_norm": 260.20452880859375, + "learning_rate": 3.604355716878403e-06, + "loss": 22.9289, + "step": 4858 + }, + { + "epoch": 17.54176072234763, + "grad_norm": 295.9897766113281, + "learning_rate": 3.5989110707803995e-06, + "loss": 23.7172, + "step": 4859 + }, + { + "epoch": 17.545372460496615, + "grad_norm": 226.99484252929688, + "learning_rate": 3.5934664246823957e-06, + "loss": 37.5844, + "step": 4860 + }, + { + "epoch": 17.545372460496615, + "eval_loss": 0.6059216260910034, + "eval_runtime": 3.1302, + "eval_samples_per_second": 57.185, + "eval_steps_per_second": 57.185, + "step": 4860 + }, + { + "epoch": 17.5489841986456, + "grad_norm": 231.67477416992188, + "learning_rate": 3.588021778584392e-06, + "loss": 39.5191, + "step": 4861 + }, + { + "epoch": 17.55259593679458, + "grad_norm": 248.46058654785156, + "learning_rate": 3.5825771324863886e-06, + "loss": 39.4246, + "step": 4862 + }, + { + "epoch": 17.55620767494357, + "grad_norm": 239.17247009277344, + "learning_rate": 3.577132486388385e-06, + "loss": 38.9811, + "step": 4863 + }, + { + "epoch": 17.55981941309255, + "grad_norm": 325.3457946777344, + "learning_rate": 3.571687840290381e-06, + "loss": 38.4724, + "step": 4864 + }, + { + "epoch": 17.563431151241534, + "grad_norm": 264.5011901855469, + "learning_rate": 3.5662431941923773e-06, + "loss": 38.79, + "step": 4865 + }, + { + "epoch": 17.567042889390518, + "grad_norm": 251.97154235839844, + "learning_rate": 3.560798548094374e-06, + "loss": 38.0342, + "step": 4866 + }, + { + "epoch": 17.570654627539504, + "grad_norm": 236.78271484375, + "learning_rate": 3.55535390199637e-06, + "loss": 39.8586, + "step": 4867 + }, + { + "epoch": 17.574266365688487, + "grad_norm": 276.8800048828125, + "learning_rate": 3.549909255898367e-06, + "loss": 37.8967, + "step": 4868 + }, + { + "epoch": 17.57787810383747, + "grad_norm": 255.9346160888672, + "learning_rate": 3.544464609800363e-06, + "loss": 39.9833, + "step": 4869 + }, + { + "epoch": 17.581489841986457, + "grad_norm": 273.71337890625, + "learning_rate": 3.5390199637023597e-06, + "loss": 38.6235, + "step": 4870 + }, + { + "epoch": 17.581489841986457, + "eval_loss": 0.6033145189285278, + "eval_runtime": 3.1252, + "eval_samples_per_second": 57.275, + "eval_steps_per_second": 57.275, + "step": 4870 + }, + { + "epoch": 17.58510158013544, + "grad_norm": 252.93063354492188, + "learning_rate": 3.533575317604356e-06, + "loss": 37.9017, + "step": 4871 + }, + { + "epoch": 17.588713318284423, + "grad_norm": 259.8314208984375, + "learning_rate": 3.528130671506352e-06, + "loss": 34.6046, + "step": 4872 + }, + { + "epoch": 17.59232505643341, + "grad_norm": 230.2709197998047, + "learning_rate": 3.5226860254083484e-06, + "loss": 35.301, + "step": 4873 + }, + { + "epoch": 17.595936794582393, + "grad_norm": 306.6289367675781, + "learning_rate": 3.517241379310345e-06, + "loss": 37.4443, + "step": 4874 + }, + { + "epoch": 17.599548532731376, + "grad_norm": 241.5065460205078, + "learning_rate": 3.5117967332123413e-06, + "loss": 36.3646, + "step": 4875 + }, + { + "epoch": 17.603160270880363, + "grad_norm": 234.2492218017578, + "learning_rate": 3.5063520871143375e-06, + "loss": 36.2621, + "step": 4876 + }, + { + "epoch": 17.606772009029346, + "grad_norm": 256.5443115234375, + "learning_rate": 3.500907441016334e-06, + "loss": 36.2202, + "step": 4877 + }, + { + "epoch": 17.61038374717833, + "grad_norm": 280.31097412109375, + "learning_rate": 3.4954627949183304e-06, + "loss": 37.5031, + "step": 4878 + }, + { + "epoch": 17.613995485327315, + "grad_norm": 304.2773132324219, + "learning_rate": 3.4900181488203267e-06, + "loss": 37.1418, + "step": 4879 + }, + { + "epoch": 17.6176072234763, + "grad_norm": 361.27716064453125, + "learning_rate": 3.484573502722323e-06, + "loss": 37.1474, + "step": 4880 + }, + { + "epoch": 17.6176072234763, + "eval_loss": 0.6052342653274536, + "eval_runtime": 3.1249, + "eval_samples_per_second": 57.282, + "eval_steps_per_second": 57.282, + "step": 4880 + }, + { + "epoch": 17.62121896162528, + "grad_norm": 237.64540100097656, + "learning_rate": 3.4791288566243195e-06, + "loss": 38.0673, + "step": 4881 + }, + { + "epoch": 17.624830699774268, + "grad_norm": 351.27215576171875, + "learning_rate": 3.4736842105263158e-06, + "loss": 38.8272, + "step": 4882 + }, + { + "epoch": 17.62844243792325, + "grad_norm": 277.1895751953125, + "learning_rate": 3.4682395644283124e-06, + "loss": 39.1524, + "step": 4883 + }, + { + "epoch": 17.632054176072234, + "grad_norm": 275.1535949707031, + "learning_rate": 3.4627949183303086e-06, + "loss": 37.9027, + "step": 4884 + }, + { + "epoch": 17.635665914221217, + "grad_norm": 335.01776123046875, + "learning_rate": 3.4573502722323053e-06, + "loss": 36.7233, + "step": 4885 + }, + { + "epoch": 17.639277652370204, + "grad_norm": 297.1637878417969, + "learning_rate": 3.4519056261343015e-06, + "loss": 37.782, + "step": 4886 + }, + { + "epoch": 17.642889390519187, + "grad_norm": 265.400390625, + "learning_rate": 3.4464609800362978e-06, + "loss": 37.6639, + "step": 4887 + }, + { + "epoch": 17.64650112866817, + "grad_norm": 345.3449401855469, + "learning_rate": 3.441016333938294e-06, + "loss": 36.7617, + "step": 4888 + }, + { + "epoch": 17.650112866817157, + "grad_norm": 256.0724182128906, + "learning_rate": 3.4355716878402902e-06, + "loss": 32.9906, + "step": 4889 + }, + { + "epoch": 17.65372460496614, + "grad_norm": 260.698486328125, + "learning_rate": 3.430127041742287e-06, + "loss": 32.0811, + "step": 4890 + }, + { + "epoch": 17.65372460496614, + "eval_loss": 0.603126585483551, + "eval_runtime": 3.1268, + "eval_samples_per_second": 57.247, + "eval_steps_per_second": 57.247, + "step": 4890 + }, + { + "epoch": 17.657336343115123, + "grad_norm": 274.9847717285156, + "learning_rate": 3.424682395644283e-06, + "loss": 31.2138, + "step": 4891 + }, + { + "epoch": 17.66094808126411, + "grad_norm": 345.5099182128906, + "learning_rate": 3.4192377495462798e-06, + "loss": 30.302, + "step": 4892 + }, + { + "epoch": 17.664559819413093, + "grad_norm": 269.1453857421875, + "learning_rate": 3.413793103448276e-06, + "loss": 30.2679, + "step": 4893 + }, + { + "epoch": 17.668171557562076, + "grad_norm": 293.7955017089844, + "learning_rate": 3.4083484573502722e-06, + "loss": 31.7616, + "step": 4894 + }, + { + "epoch": 17.671783295711062, + "grad_norm": 306.1725769042969, + "learning_rate": 3.4029038112522685e-06, + "loss": 33.1265, + "step": 4895 + }, + { + "epoch": 17.675395033860045, + "grad_norm": 329.8185119628906, + "learning_rate": 3.397459165154265e-06, + "loss": 33.2131, + "step": 4896 + }, + { + "epoch": 17.67900677200903, + "grad_norm": 340.790283203125, + "learning_rate": 3.3920145190562613e-06, + "loss": 33.243, + "step": 4897 + }, + { + "epoch": 17.682618510158015, + "grad_norm": 324.004150390625, + "learning_rate": 3.386569872958258e-06, + "loss": 33.6235, + "step": 4898 + }, + { + "epoch": 17.686230248306998, + "grad_norm": 263.9126892089844, + "learning_rate": 3.3811252268602542e-06, + "loss": 33.2524, + "step": 4899 + }, + { + "epoch": 17.68984198645598, + "grad_norm": 274.6680603027344, + "learning_rate": 3.375680580762251e-06, + "loss": 34.6629, + "step": 4900 + }, + { + "epoch": 17.68984198645598, + "eval_loss": 0.6027778387069702, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4900 + }, + { + "epoch": 17.693453724604964, + "grad_norm": 317.1280822753906, + "learning_rate": 3.370235934664247e-06, + "loss": 33.3088, + "step": 4901 + }, + { + "epoch": 17.69706546275395, + "grad_norm": 304.1892395019531, + "learning_rate": 3.364791288566243e-06, + "loss": 34.5045, + "step": 4902 + }, + { + "epoch": 17.700677200902934, + "grad_norm": 278.75933837890625, + "learning_rate": 3.3593466424682396e-06, + "loss": 35.8429, + "step": 4903 + }, + { + "epoch": 17.704288939051917, + "grad_norm": 299.76971435546875, + "learning_rate": 3.353901996370236e-06, + "loss": 36.2401, + "step": 4904 + }, + { + "epoch": 17.707900677200904, + "grad_norm": 253.46795654296875, + "learning_rate": 3.3484573502722324e-06, + "loss": 28.938, + "step": 4905 + }, + { + "epoch": 17.711512415349887, + "grad_norm": 220.74098205566406, + "learning_rate": 3.3430127041742287e-06, + "loss": 21.6689, + "step": 4906 + }, + { + "epoch": 17.71512415349887, + "grad_norm": 255.79150390625, + "learning_rate": 3.3375680580762253e-06, + "loss": 21.3497, + "step": 4907 + }, + { + "epoch": 17.718735891647857, + "grad_norm": 284.2683410644531, + "learning_rate": 3.3321234119782216e-06, + "loss": 22.9276, + "step": 4908 + }, + { + "epoch": 17.72234762979684, + "grad_norm": 296.7882080078125, + "learning_rate": 3.3266787658802182e-06, + "loss": 24.7304, + "step": 4909 + }, + { + "epoch": 17.725959367945823, + "grad_norm": 217.35546875, + "learning_rate": 3.321234119782214e-06, + "loss": 38.7687, + "step": 4910 + }, + { + "epoch": 17.725959367945823, + "eval_loss": 0.6015192866325378, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4910 + }, + { + "epoch": 17.72957110609481, + "grad_norm": 256.7005920410156, + "learning_rate": 3.3157894736842107e-06, + "loss": 39.7421, + "step": 4911 + }, + { + "epoch": 17.733182844243792, + "grad_norm": 203.49417114257812, + "learning_rate": 3.310344827586207e-06, + "loss": 39.2911, + "step": 4912 + }, + { + "epoch": 17.736794582392776, + "grad_norm": 282.81439208984375, + "learning_rate": 3.3049001814882036e-06, + "loss": 39.2524, + "step": 4913 + }, + { + "epoch": 17.740406320541762, + "grad_norm": 315.3716735839844, + "learning_rate": 3.2994555353902e-06, + "loss": 37.2097, + "step": 4914 + }, + { + "epoch": 17.744018058690745, + "grad_norm": 250.96484375, + "learning_rate": 3.294010889292196e-06, + "loss": 37.6568, + "step": 4915 + }, + { + "epoch": 17.74762979683973, + "grad_norm": 299.4822082519531, + "learning_rate": 3.2885662431941927e-06, + "loss": 38.9578, + "step": 4916 + }, + { + "epoch": 17.751241534988715, + "grad_norm": 261.2537536621094, + "learning_rate": 3.2831215970961885e-06, + "loss": 40.3838, + "step": 4917 + }, + { + "epoch": 17.754853273137698, + "grad_norm": 220.55218505859375, + "learning_rate": 3.277676950998185e-06, + "loss": 39.2068, + "step": 4918 + }, + { + "epoch": 17.75846501128668, + "grad_norm": 238.06874084472656, + "learning_rate": 3.2722323049001814e-06, + "loss": 40.5383, + "step": 4919 + }, + { + "epoch": 17.762076749435664, + "grad_norm": 223.9597625732422, + "learning_rate": 3.266787658802178e-06, + "loss": 37.3857, + "step": 4920 + }, + { + "epoch": 17.762076749435664, + "eval_loss": 0.602606475353241, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4920 + }, + { + "epoch": 17.76568848758465, + "grad_norm": 278.9289245605469, + "learning_rate": 3.2613430127041742e-06, + "loss": 37.187, + "step": 4921 + }, + { + "epoch": 17.769300225733634, + "grad_norm": 306.52398681640625, + "learning_rate": 3.255898366606171e-06, + "loss": 37.5243, + "step": 4922 + }, + { + "epoch": 17.772911963882617, + "grad_norm": 231.3939208984375, + "learning_rate": 3.250453720508167e-06, + "loss": 35.3104, + "step": 4923 + }, + { + "epoch": 17.776523702031604, + "grad_norm": 216.77613830566406, + "learning_rate": 3.2450090744101638e-06, + "loss": 36.0904, + "step": 4924 + }, + { + "epoch": 17.780135440180587, + "grad_norm": 256.0504150390625, + "learning_rate": 3.2395644283121596e-06, + "loss": 36.4117, + "step": 4925 + }, + { + "epoch": 17.78374717832957, + "grad_norm": 253.29734802246094, + "learning_rate": 3.2341197822141562e-06, + "loss": 37.197, + "step": 4926 + }, + { + "epoch": 17.787358916478556, + "grad_norm": 268.80780029296875, + "learning_rate": 3.2286751361161525e-06, + "loss": 36.4606, + "step": 4927 + }, + { + "epoch": 17.79097065462754, + "grad_norm": 302.3041076660156, + "learning_rate": 3.2232304900181487e-06, + "loss": 36.8647, + "step": 4928 + }, + { + "epoch": 17.794582392776523, + "grad_norm": 274.23797607421875, + "learning_rate": 3.2177858439201454e-06, + "loss": 37.3981, + "step": 4929 + }, + { + "epoch": 17.79819413092551, + "grad_norm": 281.4304504394531, + "learning_rate": 3.2123411978221416e-06, + "loss": 37.2304, + "step": 4930 + }, + { + "epoch": 17.79819413092551, + "eval_loss": 0.6050394773483276, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 4930 + }, + { + "epoch": 17.801805869074492, + "grad_norm": 277.47698974609375, + "learning_rate": 3.2068965517241382e-06, + "loss": 35.9281, + "step": 4931 + }, + { + "epoch": 17.805417607223475, + "grad_norm": 394.02294921875, + "learning_rate": 3.201451905626134e-06, + "loss": 39.0143, + "step": 4932 + }, + { + "epoch": 17.809029345372462, + "grad_norm": 252.8087158203125, + "learning_rate": 3.1960072595281307e-06, + "loss": 36.9452, + "step": 4933 + }, + { + "epoch": 17.812641083521445, + "grad_norm": 249.54962158203125, + "learning_rate": 3.190562613430127e-06, + "loss": 39.2442, + "step": 4934 + }, + { + "epoch": 17.816252821670428, + "grad_norm": 286.9231262207031, + "learning_rate": 3.1851179673321236e-06, + "loss": 38.6445, + "step": 4935 + }, + { + "epoch": 17.819864559819415, + "grad_norm": 345.7146911621094, + "learning_rate": 3.17967332123412e-06, + "loss": 37.1794, + "step": 4936 + }, + { + "epoch": 17.823476297968398, + "grad_norm": 271.23089599609375, + "learning_rate": 3.1742286751361165e-06, + "loss": 36.3952, + "step": 4937 + }, + { + "epoch": 17.82708803611738, + "grad_norm": 406.3717346191406, + "learning_rate": 3.1687840290381127e-06, + "loss": 33.8166, + "step": 4938 + }, + { + "epoch": 17.830699774266364, + "grad_norm": 300.12554931640625, + "learning_rate": 3.1633393829401094e-06, + "loss": 30.9614, + "step": 4939 + }, + { + "epoch": 17.83431151241535, + "grad_norm": 229.67218017578125, + "learning_rate": 3.157894736842105e-06, + "loss": 31.8592, + "step": 4940 + }, + { + "epoch": 17.83431151241535, + "eval_loss": 0.6021057367324829, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4940 + }, + { + "epoch": 17.837923250564334, + "grad_norm": 269.0873107910156, + "learning_rate": 3.1524500907441014e-06, + "loss": 31.7702, + "step": 4941 + }, + { + "epoch": 17.841534988713317, + "grad_norm": 279.0237731933594, + "learning_rate": 3.147005444646098e-06, + "loss": 31.3615, + "step": 4942 + }, + { + "epoch": 17.845146726862303, + "grad_norm": 234.94839477539062, + "learning_rate": 3.1415607985480943e-06, + "loss": 31.9314, + "step": 4943 + }, + { + "epoch": 17.848758465011286, + "grad_norm": 239.25613403320312, + "learning_rate": 3.136116152450091e-06, + "loss": 32.4513, + "step": 4944 + }, + { + "epoch": 17.85237020316027, + "grad_norm": 257.09661865234375, + "learning_rate": 3.130671506352087e-06, + "loss": 34.4964, + "step": 4945 + }, + { + "epoch": 17.855981941309256, + "grad_norm": 328.88006591796875, + "learning_rate": 3.125226860254084e-06, + "loss": 33.1662, + "step": 4946 + }, + { + "epoch": 17.85959367945824, + "grad_norm": 291.4894714355469, + "learning_rate": 3.1197822141560796e-06, + "loss": 34.4406, + "step": 4947 + }, + { + "epoch": 17.863205417607222, + "grad_norm": 282.81158447265625, + "learning_rate": 3.1143375680580763e-06, + "loss": 32.7141, + "step": 4948 + }, + { + "epoch": 17.86681715575621, + "grad_norm": 300.0378112792969, + "learning_rate": 3.1088929219600725e-06, + "loss": 34.3423, + "step": 4949 + }, + { + "epoch": 17.870428893905192, + "grad_norm": 267.2983703613281, + "learning_rate": 3.103448275862069e-06, + "loss": 33.1653, + "step": 4950 + }, + { + "epoch": 17.870428893905192, + "eval_loss": 0.6020416021347046, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4950 + }, + { + "epoch": 17.874040632054175, + "grad_norm": 270.53277587890625, + "learning_rate": 3.0980036297640654e-06, + "loss": 34.7582, + "step": 4951 + }, + { + "epoch": 17.877652370203162, + "grad_norm": 346.0074157714844, + "learning_rate": 3.092558983666062e-06, + "loss": 35.9911, + "step": 4952 + }, + { + "epoch": 17.881264108352145, + "grad_norm": 367.5807189941406, + "learning_rate": 3.0871143375680583e-06, + "loss": 35.3345, + "step": 4953 + }, + { + "epoch": 17.884875846501128, + "grad_norm": 304.21649169921875, + "learning_rate": 3.0816696914700545e-06, + "loss": 32.9797, + "step": 4954 + }, + { + "epoch": 17.888487584650115, + "grad_norm": 253.14601135253906, + "learning_rate": 3.0762250453720507e-06, + "loss": 22.6226, + "step": 4955 + }, + { + "epoch": 17.892099322799098, + "grad_norm": 270.3512268066406, + "learning_rate": 3.070780399274047e-06, + "loss": 21.9531, + "step": 4956 + }, + { + "epoch": 17.89571106094808, + "grad_norm": 192.73712158203125, + "learning_rate": 3.0653357531760436e-06, + "loss": 21.8497, + "step": 4957 + }, + { + "epoch": 17.899322799097064, + "grad_norm": 254.43759155273438, + "learning_rate": 3.05989110707804e-06, + "loss": 23.2694, + "step": 4958 + }, + { + "epoch": 17.90293453724605, + "grad_norm": 271.2293395996094, + "learning_rate": 3.0544464609800365e-06, + "loss": 22.9774, + "step": 4959 + }, + { + "epoch": 17.906546275395034, + "grad_norm": 213.7334747314453, + "learning_rate": 3.0490018148820327e-06, + "loss": 38.8821, + "step": 4960 + }, + { + "epoch": 17.906546275395034, + "eval_loss": 0.600848913192749, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 4960 + }, + { + "epoch": 17.910158013544017, + "grad_norm": 269.9356384277344, + "learning_rate": 3.0435571687840294e-06, + "loss": 38.6362, + "step": 4961 + }, + { + "epoch": 17.913769751693003, + "grad_norm": 237.6484832763672, + "learning_rate": 3.0381125226860256e-06, + "loss": 39.6388, + "step": 4962 + }, + { + "epoch": 17.917381489841986, + "grad_norm": 304.2347106933594, + "learning_rate": 3.032667876588022e-06, + "loss": 39.4308, + "step": 4963 + }, + { + "epoch": 17.92099322799097, + "grad_norm": 250.6772918701172, + "learning_rate": 3.027223230490018e-06, + "loss": 40.1923, + "step": 4964 + }, + { + "epoch": 17.924604966139956, + "grad_norm": 261.7320556640625, + "learning_rate": 3.0217785843920147e-06, + "loss": 37.862, + "step": 4965 + }, + { + "epoch": 17.92821670428894, + "grad_norm": 385.33197021484375, + "learning_rate": 3.016333938294011e-06, + "loss": 35.9139, + "step": 4966 + }, + { + "epoch": 17.931828442437922, + "grad_norm": 436.6773986816406, + "learning_rate": 3.010889292196007e-06, + "loss": 36.6259, + "step": 4967 + }, + { + "epoch": 17.93544018058691, + "grad_norm": 318.65673828125, + "learning_rate": 3.005444646098004e-06, + "loss": 36.1235, + "step": 4968 + }, + { + "epoch": 17.939051918735892, + "grad_norm": 241.6234893798828, + "learning_rate": 3e-06, + "loss": 37.4148, + "step": 4969 + }, + { + "epoch": 17.942663656884875, + "grad_norm": 316.8415832519531, + "learning_rate": 2.9945553539019963e-06, + "loss": 36.7089, + "step": 4970 + }, + { + "epoch": 17.942663656884875, + "eval_loss": 0.6032605171203613, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.061, + "eval_steps_per_second": 57.061, + "step": 4970 + }, + { + "epoch": 17.94627539503386, + "grad_norm": 322.0501403808594, + "learning_rate": 2.9891107078039925e-06, + "loss": 37.2222, + "step": 4971 + }, + { + "epoch": 17.949887133182845, + "grad_norm": 300.4189453125, + "learning_rate": 2.983666061705989e-06, + "loss": 37.9156, + "step": 4972 + }, + { + "epoch": 17.953498871331828, + "grad_norm": 304.39263916015625, + "learning_rate": 2.9782214156079854e-06, + "loss": 38.5253, + "step": 4973 + }, + { + "epoch": 17.957110609480814, + "grad_norm": 297.4574890136719, + "learning_rate": 2.972776769509982e-06, + "loss": 38.4385, + "step": 4974 + }, + { + "epoch": 17.960722347629797, + "grad_norm": 367.7257080078125, + "learning_rate": 2.9673321234119783e-06, + "loss": 36.2943, + "step": 4975 + }, + { + "epoch": 17.96433408577878, + "grad_norm": 274.61724853515625, + "learning_rate": 2.961887477313975e-06, + "loss": 30.8753, + "step": 4976 + }, + { + "epoch": 17.967945823927764, + "grad_norm": 358.50201416015625, + "learning_rate": 2.956442831215971e-06, + "loss": 32.1308, + "step": 4977 + }, + { + "epoch": 17.97155756207675, + "grad_norm": 493.7792663574219, + "learning_rate": 2.9509981851179674e-06, + "loss": 33.2474, + "step": 4978 + }, + { + "epoch": 17.975169300225733, + "grad_norm": 426.67138671875, + "learning_rate": 2.9455535390199636e-06, + "loss": 33.7065, + "step": 4979 + }, + { + "epoch": 17.978781038374716, + "grad_norm": 524.0231323242188, + "learning_rate": 2.94010889292196e-06, + "loss": 34.6007, + "step": 4980 + }, + { + "epoch": 17.978781038374716, + "eval_loss": 0.6021283268928528, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 4980 + }, + { + "epoch": 17.982392776523703, + "grad_norm": 395.26715087890625, + "learning_rate": 2.9346642468239565e-06, + "loss": 33.9185, + "step": 4981 + }, + { + "epoch": 17.986004514672686, + "grad_norm": 400.0454406738281, + "learning_rate": 2.9292196007259528e-06, + "loss": 34.6485, + "step": 4982 + }, + { + "epoch": 17.98961625282167, + "grad_norm": 376.1269226074219, + "learning_rate": 2.9237749546279494e-06, + "loss": 34.668, + "step": 4983 + }, + { + "epoch": 17.993227990970656, + "grad_norm": 315.5225524902344, + "learning_rate": 2.9183303085299456e-06, + "loss": 30.7058, + "step": 4984 + }, + { + "epoch": 17.99683972911964, + "grad_norm": 221.5032958984375, + "learning_rate": 2.912885662431942e-06, + "loss": 21.8055, + "step": 4985 + }, + { + "epoch": 18.0, + "grad_norm": 226.06068420410156, + "learning_rate": 2.907441016333938e-06, + "loss": 20.5066, + "step": 4986 + }, + { + "epoch": 18.003611738148983, + "grad_norm": 209.69607543945312, + "learning_rate": 2.9019963702359348e-06, + "loss": 37.9156, + "step": 4987 + }, + { + "epoch": 18.00722347629797, + "grad_norm": 218.86709594726562, + "learning_rate": 2.896551724137931e-06, + "loss": 38.8204, + "step": 4988 + }, + { + "epoch": 18.010835214446953, + "grad_norm": 218.38180541992188, + "learning_rate": 2.8911070780399276e-06, + "loss": 38.5472, + "step": 4989 + }, + { + "epoch": 18.014446952595936, + "grad_norm": 338.4778747558594, + "learning_rate": 2.885662431941924e-06, + "loss": 37.7233, + "step": 4990 + }, + { + "epoch": 18.014446952595936, + "eval_loss": 0.6013379096984863, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4990 + }, + { + "epoch": 18.018058690744923, + "grad_norm": 309.5385437011719, + "learning_rate": 2.8802177858439205e-06, + "loss": 38.3321, + "step": 4991 + }, + { + "epoch": 18.021670428893906, + "grad_norm": 335.67169189453125, + "learning_rate": 2.8747731397459168e-06, + "loss": 38.2367, + "step": 4992 + }, + { + "epoch": 18.02528216704289, + "grad_norm": 260.5025939941406, + "learning_rate": 2.8693284936479126e-06, + "loss": 38.5516, + "step": 4993 + }, + { + "epoch": 18.028893905191875, + "grad_norm": 265.4793395996094, + "learning_rate": 2.8638838475499092e-06, + "loss": 38.9539, + "step": 4994 + }, + { + "epoch": 18.03250564334086, + "grad_norm": 237.87942504882812, + "learning_rate": 2.8584392014519054e-06, + "loss": 39.4582, + "step": 4995 + }, + { + "epoch": 18.03611738148984, + "grad_norm": 252.11746215820312, + "learning_rate": 2.852994555353902e-06, + "loss": 39.3466, + "step": 4996 + }, + { + "epoch": 18.039729119638825, + "grad_norm": 298.1370849609375, + "learning_rate": 2.8475499092558983e-06, + "loss": 36.9779, + "step": 4997 + }, + { + "epoch": 18.04334085778781, + "grad_norm": 341.9007873535156, + "learning_rate": 2.842105263157895e-06, + "loss": 36.5117, + "step": 4998 + }, + { + "epoch": 18.046952595936794, + "grad_norm": 210.0319366455078, + "learning_rate": 2.8366606170598912e-06, + "loss": 34.7543, + "step": 4999 + }, + { + "epoch": 18.050564334085777, + "grad_norm": 385.6400146484375, + "learning_rate": 2.831215970961888e-06, + "loss": 36.4577, + "step": 5000 + }, + { + "epoch": 18.050564334085777, + "eval_loss": 0.6031082272529602, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 5000 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.452399434912891e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d7f68df75c7e34a42d5fd163e9fe171deafb9bc5 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed762759c792298109a88e9a12796e07f6c701360a0508d433ab6b33a13863d +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..9d9cc82877c916445851cf1d338cc2cf83224157 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52c88be144dc5fe7d595a97b14f9eebacc85835778dff637aeb19394c2bd084c +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d83d57e19eb880505d4f72eb4cf4998534649f55 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4daee8d382a6a34e2ef1e141eff13fc54897883ab7a5dba419ce5cc17b9c991c +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..96bbcf132e589e28482bd789abcd8b390b8eb834 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7561111f244646485049f0a00aea1ba5deefb4d04d02514133625ded4dd49797 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e5c7cb00cab0fba41f81a6b58ffff8ad9d01553 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc4ae160960660603e4542a0b4c000b831e8416fb0d7e44755074ce084690ff +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..463a136a5c6302ba193190999780097c092f1a93 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/trainer_state.json @@ -0,0 +1,40593 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 18.772911963882617, + "eval_steps": 10, + "global_step": 5200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + }, + { + "epoch": 15.166139954853273, + "grad_norm": 227.31346130371094, + "learning_rate": 7.181488203266788e-06, + "loss": 28.0234, + "step": 4201 + }, + { + "epoch": 15.169751693002258, + "grad_norm": 208.75048828125, + "learning_rate": 7.176043557168784e-06, + "loss": 22.5147, + "step": 4202 + }, + { + "epoch": 15.173363431151241, + "grad_norm": 222.91090393066406, + "learning_rate": 7.17059891107078e-06, + "loss": 22.1029, + "step": 4203 + }, + { + "epoch": 15.176975169300226, + "grad_norm": 219.40621948242188, + "learning_rate": 7.165154264972777e-06, + "loss": 22.9827, + "step": 4204 + }, + { + "epoch": 15.18058690744921, + "grad_norm": 229.11813354492188, + "learning_rate": 7.1597096188747735e-06, + "loss": 23.6974, + "step": 4205 + }, + { + "epoch": 15.184198645598194, + "grad_norm": 256.7950744628906, + "learning_rate": 7.15426497277677e-06, + "loss": 39.6585, + "step": 4206 + }, + { + "epoch": 15.187810383747179, + "grad_norm": 237.47613525390625, + "learning_rate": 7.148820326678766e-06, + "loss": 40.0478, + "step": 4207 + }, + { + "epoch": 15.191422121896162, + "grad_norm": 259.54296875, + "learning_rate": 7.143375680580762e-06, + "loss": 39.7604, + "step": 4208 + }, + { + "epoch": 15.195033860045147, + "grad_norm": 249.7389678955078, + "learning_rate": 7.137931034482759e-06, + "loss": 39.0201, + "step": 4209 + }, + { + "epoch": 15.198645598194132, + "grad_norm": 298.4624938964844, + "learning_rate": 7.132486388384755e-06, + "loss": 39.8575, + "step": 4210 + }, + { + "epoch": 15.198645598194132, + "eval_loss": 0.6088115572929382, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 4210 + }, + { + "epoch": 15.202257336343115, + "grad_norm": 267.57659912109375, + "learning_rate": 7.127041742286752e-06, + "loss": 38.8929, + "step": 4211 + }, + { + "epoch": 15.2058690744921, + "grad_norm": 243.88333129882812, + "learning_rate": 7.121597096188748e-06, + "loss": 39.6078, + "step": 4212 + }, + { + "epoch": 15.209480812641084, + "grad_norm": 268.2644348144531, + "learning_rate": 7.116152450090745e-06, + "loss": 39.9488, + "step": 4213 + }, + { + "epoch": 15.213092550790067, + "grad_norm": 240.2657928466797, + "learning_rate": 7.11070780399274e-06, + "loss": 40.1645, + "step": 4214 + }, + { + "epoch": 15.216704288939052, + "grad_norm": 198.76910400390625, + "learning_rate": 7.105263157894737e-06, + "loss": 38.2229, + "step": 4215 + }, + { + "epoch": 15.220316027088035, + "grad_norm": 234.11170959472656, + "learning_rate": 7.099818511796734e-06, + "loss": 39.5294, + "step": 4216 + }, + { + "epoch": 15.22392776523702, + "grad_norm": 192.80194091796875, + "learning_rate": 7.094373865698729e-06, + "loss": 36.9752, + "step": 4217 + }, + { + "epoch": 15.227539503386005, + "grad_norm": 241.8236846923828, + "learning_rate": 7.088929219600726e-06, + "loss": 36.1043, + "step": 4218 + }, + { + "epoch": 15.231151241534988, + "grad_norm": 451.6199645996094, + "learning_rate": 7.083484573502722e-06, + "loss": 37.7911, + "step": 4219 + }, + { + "epoch": 15.234762979683973, + "grad_norm": 351.9429626464844, + "learning_rate": 7.0780399274047195e-06, + "loss": 35.5202, + "step": 4220 + }, + { + "epoch": 15.234762979683973, + "eval_loss": 0.6093130111694336, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 4220 + }, + { + "epoch": 15.238374717832958, + "grad_norm": 266.4995422363281, + "learning_rate": 7.072595281306715e-06, + "loss": 37.5552, + "step": 4221 + }, + { + "epoch": 15.241986455981941, + "grad_norm": 258.74578857421875, + "learning_rate": 7.067150635208712e-06, + "loss": 37.1315, + "step": 4222 + }, + { + "epoch": 15.245598194130926, + "grad_norm": 233.30921936035156, + "learning_rate": 7.061705989110708e-06, + "loss": 36.9237, + "step": 4223 + }, + { + "epoch": 15.249209932279909, + "grad_norm": 235.8688201904297, + "learning_rate": 7.056261343012704e-06, + "loss": 38.0112, + "step": 4224 + }, + { + "epoch": 15.252821670428894, + "grad_norm": 214.88436889648438, + "learning_rate": 7.050816696914701e-06, + "loss": 38.5641, + "step": 4225 + }, + { + "epoch": 15.256433408577879, + "grad_norm": 252.64144897460938, + "learning_rate": 7.045372050816697e-06, + "loss": 36.7125, + "step": 4226 + }, + { + "epoch": 15.260045146726862, + "grad_norm": 293.78424072265625, + "learning_rate": 7.039927404718694e-06, + "loss": 37.5956, + "step": 4227 + }, + { + "epoch": 15.263656884875846, + "grad_norm": 234.13510131835938, + "learning_rate": 7.03448275862069e-06, + "loss": 38.1829, + "step": 4228 + }, + { + "epoch": 15.267268623024831, + "grad_norm": 279.534912109375, + "learning_rate": 7.029038112522686e-06, + "loss": 39.0785, + "step": 4229 + }, + { + "epoch": 15.270880361173814, + "grad_norm": 246.4442596435547, + "learning_rate": 7.023593466424683e-06, + "loss": 39.1753, + "step": 4230 + }, + { + "epoch": 15.270880361173814, + "eval_loss": 0.6043311357498169, + "eval_runtime": 3.1452, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 4230 + }, + { + "epoch": 15.2744920993228, + "grad_norm": 233.87466430664062, + "learning_rate": 7.018148820326679e-06, + "loss": 39.8464, + "step": 4231 + }, + { + "epoch": 15.278103837471784, + "grad_norm": 228.54898071289062, + "learning_rate": 7.012704174228675e-06, + "loss": 37.9721, + "step": 4232 + }, + { + "epoch": 15.281715575620767, + "grad_norm": 273.70050048828125, + "learning_rate": 7.007259528130671e-06, + "loss": 38.9153, + "step": 4233 + }, + { + "epoch": 15.285327313769752, + "grad_norm": 269.8402404785156, + "learning_rate": 7.001814882032668e-06, + "loss": 36.7607, + "step": 4234 + }, + { + "epoch": 15.288939051918735, + "grad_norm": 260.13629150390625, + "learning_rate": 6.996370235934665e-06, + "loss": 35.3684, + "step": 4235 + }, + { + "epoch": 15.29255079006772, + "grad_norm": 223.9878692626953, + "learning_rate": 6.990925589836661e-06, + "loss": 32.8784, + "step": 4236 + }, + { + "epoch": 15.296162528216705, + "grad_norm": 225.69212341308594, + "learning_rate": 6.985480943738657e-06, + "loss": 31.3751, + "step": 4237 + }, + { + "epoch": 15.299774266365688, + "grad_norm": 215.99801635742188, + "learning_rate": 6.980036297640653e-06, + "loss": 31.5331, + "step": 4238 + }, + { + "epoch": 15.303386004514673, + "grad_norm": 263.26568603515625, + "learning_rate": 6.97459165154265e-06, + "loss": 32.5806, + "step": 4239 + }, + { + "epoch": 15.306997742663658, + "grad_norm": 203.2392578125, + "learning_rate": 6.969147005444646e-06, + "loss": 31.6379, + "step": 4240 + }, + { + "epoch": 15.306997742663658, + "eval_loss": 0.6046441793441772, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 4240 + }, + { + "epoch": 15.31060948081264, + "grad_norm": 221.2167510986328, + "learning_rate": 6.963702359346643e-06, + "loss": 33.7034, + "step": 4241 + }, + { + "epoch": 15.314221218961626, + "grad_norm": 212.58737182617188, + "learning_rate": 6.958257713248639e-06, + "loss": 32.5511, + "step": 4242 + }, + { + "epoch": 15.317832957110609, + "grad_norm": 270.7123718261719, + "learning_rate": 6.952813067150635e-06, + "loss": 33.2513, + "step": 4243 + }, + { + "epoch": 15.321444695259594, + "grad_norm": 270.2066345214844, + "learning_rate": 6.9473684210526315e-06, + "loss": 33.9559, + "step": 4244 + }, + { + "epoch": 15.325056433408578, + "grad_norm": 232.8043212890625, + "learning_rate": 6.941923774954628e-06, + "loss": 33.9916, + "step": 4245 + }, + { + "epoch": 15.328668171557561, + "grad_norm": 325.419921875, + "learning_rate": 6.936479128856625e-06, + "loss": 35.2098, + "step": 4246 + }, + { + "epoch": 15.332279909706546, + "grad_norm": 303.326416015625, + "learning_rate": 6.93103448275862e-06, + "loss": 35.0784, + "step": 4247 + }, + { + "epoch": 15.335891647855531, + "grad_norm": 327.05963134765625, + "learning_rate": 6.925589836660617e-06, + "loss": 35.9915, + "step": 4248 + }, + { + "epoch": 15.339503386004514, + "grad_norm": 326.58795166015625, + "learning_rate": 6.9201451905626135e-06, + "loss": 35.1914, + "step": 4249 + }, + { + "epoch": 15.343115124153499, + "grad_norm": 406.38812255859375, + "learning_rate": 6.914700544464611e-06, + "loss": 37.1535, + "step": 4250 + }, + { + "epoch": 15.343115124153499, + "eval_loss": 0.6056071519851685, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 4250 + }, + { + "epoch": 15.346726862302482, + "grad_norm": 325.6965637207031, + "learning_rate": 6.909255898366606e-06, + "loss": 29.8698, + "step": 4251 + }, + { + "epoch": 15.350338600451467, + "grad_norm": 212.59727478027344, + "learning_rate": 6.903811252268603e-06, + "loss": 22.2995, + "step": 4252 + }, + { + "epoch": 15.353950338600452, + "grad_norm": 257.447509765625, + "learning_rate": 6.898366606170599e-06, + "loss": 23.1014, + "step": 4253 + }, + { + "epoch": 15.357562076749435, + "grad_norm": 266.139892578125, + "learning_rate": 6.8929219600725955e-06, + "loss": 23.2319, + "step": 4254 + }, + { + "epoch": 15.36117381489842, + "grad_norm": 332.7207336425781, + "learning_rate": 6.887477313974592e-06, + "loss": 23.7218, + "step": 4255 + }, + { + "epoch": 15.364785553047405, + "grad_norm": 272.7341003417969, + "learning_rate": 6.882032667876588e-06, + "loss": 39.5787, + "step": 4256 + }, + { + "epoch": 15.368397291196388, + "grad_norm": 259.00872802734375, + "learning_rate": 6.876588021778585e-06, + "loss": 41.0874, + "step": 4257 + }, + { + "epoch": 15.372009029345373, + "grad_norm": 236.87033081054688, + "learning_rate": 6.8711433756805804e-06, + "loss": 38.9811, + "step": 4258 + }, + { + "epoch": 15.375620767494357, + "grad_norm": 293.6808776855469, + "learning_rate": 6.8656987295825775e-06, + "loss": 39.481, + "step": 4259 + }, + { + "epoch": 15.37923250564334, + "grad_norm": 266.0845947265625, + "learning_rate": 6.860254083484574e-06, + "loss": 39.4595, + "step": 4260 + }, + { + "epoch": 15.37923250564334, + "eval_loss": 0.6039742231369019, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 4260 + }, + { + "epoch": 15.382844243792325, + "grad_norm": 398.0877685546875, + "learning_rate": 6.85480943738657e-06, + "loss": 38.8899, + "step": 4261 + }, + { + "epoch": 15.386455981941308, + "grad_norm": 208.37376403808594, + "learning_rate": 6.849364791288566e-06, + "loss": 39.2194, + "step": 4262 + }, + { + "epoch": 15.390067720090293, + "grad_norm": 214.6958770751953, + "learning_rate": 6.8439201451905624e-06, + "loss": 38.9911, + "step": 4263 + }, + { + "epoch": 15.393679458239278, + "grad_norm": 210.2147674560547, + "learning_rate": 6.8384754990925595e-06, + "loss": 40.5973, + "step": 4264 + }, + { + "epoch": 15.397291196388261, + "grad_norm": 240.47030639648438, + "learning_rate": 6.833030852994556e-06, + "loss": 39.3936, + "step": 4265 + }, + { + "epoch": 15.400902934537246, + "grad_norm": 273.86883544921875, + "learning_rate": 6.827586206896552e-06, + "loss": 40.0848, + "step": 4266 + }, + { + "epoch": 15.404514672686231, + "grad_norm": 239.36453247070312, + "learning_rate": 6.822141560798548e-06, + "loss": 36.5967, + "step": 4267 + }, + { + "epoch": 15.408126410835214, + "grad_norm": 215.3413543701172, + "learning_rate": 6.8166969147005444e-06, + "loss": 37.8173, + "step": 4268 + }, + { + "epoch": 15.411738148984199, + "grad_norm": 260.1557312011719, + "learning_rate": 6.811252268602541e-06, + "loss": 37.7175, + "step": 4269 + }, + { + "epoch": 15.415349887133182, + "grad_norm": 239.4988555908203, + "learning_rate": 6.805807622504537e-06, + "loss": 37.0618, + "step": 4270 + }, + { + "epoch": 15.415349887133182, + "eval_loss": 0.6049810647964478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4270 + }, + { + "epoch": 15.418961625282167, + "grad_norm": 223.06094360351562, + "learning_rate": 6.800362976406534e-06, + "loss": 37.0687, + "step": 4271 + }, + { + "epoch": 15.422573363431152, + "grad_norm": 261.7460632324219, + "learning_rate": 6.79491833030853e-06, + "loss": 35.9437, + "step": 4272 + }, + { + "epoch": 15.426185101580135, + "grad_norm": 230.92135620117188, + "learning_rate": 6.7894736842105264e-06, + "loss": 38.3316, + "step": 4273 + }, + { + "epoch": 15.42979683972912, + "grad_norm": 370.6309509277344, + "learning_rate": 6.784029038112523e-06, + "loss": 38.2666, + "step": 4274 + }, + { + "epoch": 15.433408577878104, + "grad_norm": 249.7823944091797, + "learning_rate": 6.77858439201452e-06, + "loss": 38.1159, + "step": 4275 + }, + { + "epoch": 15.437020316027088, + "grad_norm": 404.1676330566406, + "learning_rate": 6.773139745916516e-06, + "loss": 37.6548, + "step": 4276 + }, + { + "epoch": 15.440632054176072, + "grad_norm": 256.3241271972656, + "learning_rate": 6.767695099818511e-06, + "loss": 38.3713, + "step": 4277 + }, + { + "epoch": 15.444243792325057, + "grad_norm": 240.55934143066406, + "learning_rate": 6.7622504537205084e-06, + "loss": 39.2487, + "step": 4278 + }, + { + "epoch": 15.44785553047404, + "grad_norm": 230.010009765625, + "learning_rate": 6.756805807622505e-06, + "loss": 39.4391, + "step": 4279 + }, + { + "epoch": 15.451467268623025, + "grad_norm": 226.51385498046875, + "learning_rate": 6.751361161524502e-06, + "loss": 38.6273, + "step": 4280 + }, + { + "epoch": 15.451467268623025, + "eval_loss": 0.6027400493621826, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 4280 + }, + { + "epoch": 15.455079006772008, + "grad_norm": 314.57476806640625, + "learning_rate": 6.745916515426497e-06, + "loss": 38.583, + "step": 4281 + }, + { + "epoch": 15.458690744920993, + "grad_norm": 229.91238403320312, + "learning_rate": 6.740471869328494e-06, + "loss": 39.2433, + "step": 4282 + }, + { + "epoch": 15.462302483069978, + "grad_norm": 284.7301330566406, + "learning_rate": 6.7350272232304904e-06, + "loss": 38.8577, + "step": 4283 + }, + { + "epoch": 15.465914221218961, + "grad_norm": 209.32266235351562, + "learning_rate": 6.729582577132486e-06, + "loss": 34.928, + "step": 4284 + }, + { + "epoch": 15.469525959367946, + "grad_norm": 264.6195068359375, + "learning_rate": 6.724137931034483e-06, + "loss": 32.0527, + "step": 4285 + }, + { + "epoch": 15.47313769751693, + "grad_norm": 224.2421112060547, + "learning_rate": 6.718693284936479e-06, + "loss": 31.939, + "step": 4286 + }, + { + "epoch": 15.476749435665914, + "grad_norm": 233.0791015625, + "learning_rate": 6.713248638838476e-06, + "loss": 32.5402, + "step": 4287 + }, + { + "epoch": 15.480361173814899, + "grad_norm": 284.129638671875, + "learning_rate": 6.707803992740472e-06, + "loss": 31.0069, + "step": 4288 + }, + { + "epoch": 15.483972911963882, + "grad_norm": 253.6517791748047, + "learning_rate": 6.702359346642469e-06, + "loss": 32.0172, + "step": 4289 + }, + { + "epoch": 15.487584650112867, + "grad_norm": 305.63775634765625, + "learning_rate": 6.696914700544465e-06, + "loss": 34.1643, + "step": 4290 + }, + { + "epoch": 15.487584650112867, + "eval_loss": 0.6044390201568604, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 4290 + }, + { + "epoch": 15.491196388261852, + "grad_norm": 224.6516876220703, + "learning_rate": 6.691470054446461e-06, + "loss": 32.4735, + "step": 4291 + }, + { + "epoch": 15.494808126410835, + "grad_norm": 257.5385437011719, + "learning_rate": 6.686025408348457e-06, + "loss": 33.9272, + "step": 4292 + }, + { + "epoch": 15.49841986455982, + "grad_norm": 393.9106140136719, + "learning_rate": 6.680580762250454e-06, + "loss": 34.4176, + "step": 4293 + }, + { + "epoch": 15.502031602708804, + "grad_norm": 333.5639953613281, + "learning_rate": 6.675136116152451e-06, + "loss": 34.5695, + "step": 4294 + }, + { + "epoch": 15.505643340857787, + "grad_norm": 319.8660888671875, + "learning_rate": 6.669691470054446e-06, + "loss": 34.5337, + "step": 4295 + }, + { + "epoch": 15.509255079006772, + "grad_norm": 246.78086853027344, + "learning_rate": 6.664246823956443e-06, + "loss": 34.8297, + "step": 4296 + }, + { + "epoch": 15.512866817155757, + "grad_norm": 313.4530944824219, + "learning_rate": 6.658802177858439e-06, + "loss": 34.6901, + "step": 4297 + }, + { + "epoch": 15.51647855530474, + "grad_norm": 257.2852783203125, + "learning_rate": 6.6533575317604364e-06, + "loss": 35.3892, + "step": 4298 + }, + { + "epoch": 15.520090293453725, + "grad_norm": 336.5549011230469, + "learning_rate": 6.647912885662432e-06, + "loss": 36.3347, + "step": 4299 + }, + { + "epoch": 15.523702031602708, + "grad_norm": 275.726806640625, + "learning_rate": 6.642468239564428e-06, + "loss": 36.3559, + "step": 4300 + }, + { + "epoch": 15.523702031602708, + "eval_loss": 0.6056334376335144, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 4300 + }, + { + "epoch": 15.527313769751693, + "grad_norm": 275.5987243652344, + "learning_rate": 6.637023593466425e-06, + "loss": 28.5887, + "step": 4301 + }, + { + "epoch": 15.530925507900678, + "grad_norm": 242.59762573242188, + "learning_rate": 6.631578947368421e-06, + "loss": 22.1398, + "step": 4302 + }, + { + "epoch": 15.534537246049661, + "grad_norm": 228.04344177246094, + "learning_rate": 6.626134301270418e-06, + "loss": 21.4593, + "step": 4303 + }, + { + "epoch": 15.538148984198646, + "grad_norm": 204.2377166748047, + "learning_rate": 6.620689655172414e-06, + "loss": 22.5132, + "step": 4304 + }, + { + "epoch": 15.54176072234763, + "grad_norm": 243.0237579345703, + "learning_rate": 6.615245009074411e-06, + "loss": 24.2777, + "step": 4305 + }, + { + "epoch": 15.545372460496614, + "grad_norm": 227.2841339111328, + "learning_rate": 6.609800362976407e-06, + "loss": 39.7235, + "step": 4306 + }, + { + "epoch": 15.548984198645599, + "grad_norm": 253.8453826904297, + "learning_rate": 6.6043557168784025e-06, + "loss": 39.9317, + "step": 4307 + }, + { + "epoch": 15.552595936794582, + "grad_norm": 243.62757873535156, + "learning_rate": 6.5989110707804e-06, + "loss": 38.9825, + "step": 4308 + }, + { + "epoch": 15.556207674943566, + "grad_norm": 262.4398498535156, + "learning_rate": 6.593466424682396e-06, + "loss": 39.7456, + "step": 4309 + }, + { + "epoch": 15.559819413092551, + "grad_norm": 268.5821228027344, + "learning_rate": 6.588021778584392e-06, + "loss": 39.5152, + "step": 4310 + }, + { + "epoch": 15.559819413092551, + "eval_loss": 0.6060237288475037, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4310 + }, + { + "epoch": 15.563431151241534, + "grad_norm": 297.6933898925781, + "learning_rate": 6.582577132486388e-06, + "loss": 40.1259, + "step": 4311 + }, + { + "epoch": 15.56704288939052, + "grad_norm": 234.08816528320312, + "learning_rate": 6.577132486388385e-06, + "loss": 40.8591, + "step": 4312 + }, + { + "epoch": 15.570654627539504, + "grad_norm": 292.2416687011719, + "learning_rate": 6.571687840290382e-06, + "loss": 39.2377, + "step": 4313 + }, + { + "epoch": 15.574266365688487, + "grad_norm": 205.25888061523438, + "learning_rate": 6.566243194192377e-06, + "loss": 39.92, + "step": 4314 + }, + { + "epoch": 15.577878103837472, + "grad_norm": 229.06695556640625, + "learning_rate": 6.560798548094374e-06, + "loss": 39.8886, + "step": 4315 + }, + { + "epoch": 15.581489841986457, + "grad_norm": 223.3977508544922, + "learning_rate": 6.55535390199637e-06, + "loss": 38.5423, + "step": 4316 + }, + { + "epoch": 15.58510158013544, + "grad_norm": 254.60203552246094, + "learning_rate": 6.549909255898367e-06, + "loss": 36.8055, + "step": 4317 + }, + { + "epoch": 15.588713318284425, + "grad_norm": 304.463623046875, + "learning_rate": 6.544464609800363e-06, + "loss": 37.6164, + "step": 4318 + }, + { + "epoch": 15.592325056433408, + "grad_norm": 279.955810546875, + "learning_rate": 6.53901996370236e-06, + "loss": 37.4778, + "step": 4319 + }, + { + "epoch": 15.595936794582393, + "grad_norm": 230.11105346679688, + "learning_rate": 6.533575317604356e-06, + "loss": 36.9663, + "step": 4320 + }, + { + "epoch": 15.595936794582393, + "eval_loss": 0.6048213243484497, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.966, + "eval_steps_per_second": 56.966, + "step": 4320 + }, + { + "epoch": 15.599548532731378, + "grad_norm": 261.98187255859375, + "learning_rate": 6.528130671506351e-06, + "loss": 37.7402, + "step": 4321 + }, + { + "epoch": 15.60316027088036, + "grad_norm": 247.34771728515625, + "learning_rate": 6.5226860254083485e-06, + "loss": 37.1402, + "step": 4322 + }, + { + "epoch": 15.606772009029346, + "grad_norm": 277.1517333984375, + "learning_rate": 6.517241379310345e-06, + "loss": 38.3976, + "step": 4323 + }, + { + "epoch": 15.610383747178329, + "grad_norm": 231.89683532714844, + "learning_rate": 6.511796733212342e-06, + "loss": 38.0834, + "step": 4324 + }, + { + "epoch": 15.613995485327314, + "grad_norm": 323.8349304199219, + "learning_rate": 6.506352087114337e-06, + "loss": 37.9085, + "step": 4325 + }, + { + "epoch": 15.617607223476298, + "grad_norm": 263.5240783691406, + "learning_rate": 6.500907441016334e-06, + "loss": 37.0702, + "step": 4326 + }, + { + "epoch": 15.621218961625281, + "grad_norm": 217.0517578125, + "learning_rate": 6.4954627949183305e-06, + "loss": 36.9406, + "step": 4327 + }, + { + "epoch": 15.624830699774266, + "grad_norm": 267.4161682128906, + "learning_rate": 6.4900181488203276e-06, + "loss": 38.8773, + "step": 4328 + }, + { + "epoch": 15.628442437923251, + "grad_norm": 232.36000061035156, + "learning_rate": 6.484573502722323e-06, + "loss": 38.4978, + "step": 4329 + }, + { + "epoch": 15.632054176072234, + "grad_norm": 241.61373901367188, + "learning_rate": 6.479128856624319e-06, + "loss": 38.4895, + "step": 4330 + }, + { + "epoch": 15.632054176072234, + "eval_loss": 0.6024956703186035, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4330 + }, + { + "epoch": 15.635665914221219, + "grad_norm": 232.27928161621094, + "learning_rate": 6.473684210526316e-06, + "loss": 38.8551, + "step": 4331 + }, + { + "epoch": 15.639277652370204, + "grad_norm": 243.42828369140625, + "learning_rate": 6.4682395644283125e-06, + "loss": 38.6475, + "step": 4332 + }, + { + "epoch": 15.642889390519187, + "grad_norm": 306.2618103027344, + "learning_rate": 6.462794918330309e-06, + "loss": 37.2015, + "step": 4333 + }, + { + "epoch": 15.646501128668172, + "grad_norm": 335.795166015625, + "learning_rate": 6.457350272232305e-06, + "loss": 36.5255, + "step": 4334 + }, + { + "epoch": 15.650112866817155, + "grad_norm": 209.6246337890625, + "learning_rate": 6.451905626134302e-06, + "loss": 32.4219, + "step": 4335 + }, + { + "epoch": 15.65372460496614, + "grad_norm": 283.2094421386719, + "learning_rate": 6.446460980036297e-06, + "loss": 30.9137, + "step": 4336 + }, + { + "epoch": 15.657336343115125, + "grad_norm": 255.4412841796875, + "learning_rate": 6.441016333938294e-06, + "loss": 30.8939, + "step": 4337 + }, + { + "epoch": 15.660948081264108, + "grad_norm": 217.8052215576172, + "learning_rate": 6.435571687840291e-06, + "loss": 31.5974, + "step": 4338 + }, + { + "epoch": 15.664559819413093, + "grad_norm": 215.64398193359375, + "learning_rate": 6.430127041742287e-06, + "loss": 30.0276, + "step": 4339 + }, + { + "epoch": 15.668171557562077, + "grad_norm": 244.32704162597656, + "learning_rate": 6.424682395644283e-06, + "loss": 32.5249, + "step": 4340 + }, + { + "epoch": 15.668171557562077, + "eval_loss": 0.6037233471870422, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4340 + }, + { + "epoch": 15.67178329571106, + "grad_norm": 270.9132080078125, + "learning_rate": 6.419237749546279e-06, + "loss": 32.9923, + "step": 4341 + }, + { + "epoch": 15.675395033860045, + "grad_norm": 230.20314025878906, + "learning_rate": 6.4137931034482765e-06, + "loss": 32.871, + "step": 4342 + }, + { + "epoch": 15.679006772009028, + "grad_norm": 372.4366149902344, + "learning_rate": 6.408348457350273e-06, + "loss": 35.2687, + "step": 4343 + }, + { + "epoch": 15.682618510158013, + "grad_norm": 325.0901794433594, + "learning_rate": 6.402903811252268e-06, + "loss": 34.3107, + "step": 4344 + }, + { + "epoch": 15.686230248306998, + "grad_norm": 277.8683166503906, + "learning_rate": 6.397459165154265e-06, + "loss": 34.291, + "step": 4345 + }, + { + "epoch": 15.689841986455981, + "grad_norm": 262.566162109375, + "learning_rate": 6.392014519056261e-06, + "loss": 33.2989, + "step": 4346 + }, + { + "epoch": 15.693453724604966, + "grad_norm": 293.56536865234375, + "learning_rate": 6.386569872958258e-06, + "loss": 35.6865, + "step": 4347 + }, + { + "epoch": 15.697065462753951, + "grad_norm": 291.1886291503906, + "learning_rate": 6.381125226860254e-06, + "loss": 35.6959, + "step": 4348 + }, + { + "epoch": 15.700677200902934, + "grad_norm": 265.2365417480469, + "learning_rate": 6.375680580762251e-06, + "loss": 36.479, + "step": 4349 + }, + { + "epoch": 15.704288939051919, + "grad_norm": 342.8822021484375, + "learning_rate": 6.370235934664247e-06, + "loss": 35.9198, + "step": 4350 + }, + { + "epoch": 15.704288939051919, + "eval_loss": 0.603361189365387, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4350 + }, + { + "epoch": 15.707900677200904, + "grad_norm": 276.1657409667969, + "learning_rate": 6.364791288566243e-06, + "loss": 29.429, + "step": 4351 + }, + { + "epoch": 15.711512415349887, + "grad_norm": 267.2456359863281, + "learning_rate": 6.35934664246824e-06, + "loss": 23.0038, + "step": 4352 + }, + { + "epoch": 15.715124153498872, + "grad_norm": 255.4893798828125, + "learning_rate": 6.353901996370236e-06, + "loss": 21.1185, + "step": 4353 + }, + { + "epoch": 15.718735891647855, + "grad_norm": 252.10501098632812, + "learning_rate": 6.348457350272233e-06, + "loss": 23.1769, + "step": 4354 + }, + { + "epoch": 15.72234762979684, + "grad_norm": 239.63905334472656, + "learning_rate": 6.343012704174228e-06, + "loss": 24.5905, + "step": 4355 + }, + { + "epoch": 15.725959367945824, + "grad_norm": 228.00950622558594, + "learning_rate": 6.337568058076225e-06, + "loss": 39.6657, + "step": 4356 + }, + { + "epoch": 15.729571106094808, + "grad_norm": 234.10647583007812, + "learning_rate": 6.332123411978222e-06, + "loss": 41.145, + "step": 4357 + }, + { + "epoch": 15.733182844243792, + "grad_norm": 236.55223083496094, + "learning_rate": 6.326678765880219e-06, + "loss": 40.2784, + "step": 4358 + }, + { + "epoch": 15.736794582392777, + "grad_norm": 340.1712646484375, + "learning_rate": 6.321234119782214e-06, + "loss": 39.3598, + "step": 4359 + }, + { + "epoch": 15.74040632054176, + "grad_norm": 269.4134826660156, + "learning_rate": 6.31578947368421e-06, + "loss": 38.7777, + "step": 4360 + }, + { + "epoch": 15.74040632054176, + "eval_loss": 0.6048015356063843, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4360 + }, + { + "epoch": 15.744018058690745, + "grad_norm": 316.5471496582031, + "learning_rate": 6.310344827586207e-06, + "loss": 39.6707, + "step": 4361 + }, + { + "epoch": 15.747629796839728, + "grad_norm": 231.31820678710938, + "learning_rate": 6.304900181488203e-06, + "loss": 38.0009, + "step": 4362 + }, + { + "epoch": 15.751241534988713, + "grad_norm": 207.19117736816406, + "learning_rate": 6.2994555353902e-06, + "loss": 41.6523, + "step": 4363 + }, + { + "epoch": 15.754853273137698, + "grad_norm": 239.8341064453125, + "learning_rate": 6.294010889292196e-06, + "loss": 40.3203, + "step": 4364 + }, + { + "epoch": 15.758465011286681, + "grad_norm": 277.2004089355469, + "learning_rate": 6.288566243194193e-06, + "loss": 39.8026, + "step": 4365 + }, + { + "epoch": 15.762076749435666, + "grad_norm": 227.74728393554688, + "learning_rate": 6.2831215970961886e-06, + "loss": 38.1561, + "step": 4366 + }, + { + "epoch": 15.76568848758465, + "grad_norm": 268.6826477050781, + "learning_rate": 6.277676950998185e-06, + "loss": 37.4653, + "step": 4367 + }, + { + "epoch": 15.769300225733634, + "grad_norm": 308.92950439453125, + "learning_rate": 6.272232304900182e-06, + "loss": 36.3506, + "step": 4368 + }, + { + "epoch": 15.772911963882619, + "grad_norm": 216.53627014160156, + "learning_rate": 6.266787658802178e-06, + "loss": 36.12, + "step": 4369 + }, + { + "epoch": 15.776523702031604, + "grad_norm": 264.0691833496094, + "learning_rate": 6.261343012704174e-06, + "loss": 37.5023, + "step": 4370 + }, + { + "epoch": 15.776523702031604, + "eval_loss": 0.608928382396698, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.989, + "eval_steps_per_second": 56.989, + "step": 4370 + }, + { + "epoch": 15.780135440180587, + "grad_norm": 474.7265319824219, + "learning_rate": 6.2558983666061706e-06, + "loss": 38.8381, + "step": 4371 + }, + { + "epoch": 15.783747178329572, + "grad_norm": 303.66229248046875, + "learning_rate": 6.250453720508168e-06, + "loss": 36.5951, + "step": 4372 + }, + { + "epoch": 15.787358916478555, + "grad_norm": 231.65744018554688, + "learning_rate": 6.245009074410164e-06, + "loss": 36.4717, + "step": 4373 + }, + { + "epoch": 15.79097065462754, + "grad_norm": 235.25833129882812, + "learning_rate": 6.239564428312159e-06, + "loss": 38.4578, + "step": 4374 + }, + { + "epoch": 15.794582392776524, + "grad_norm": 215.5384063720703, + "learning_rate": 6.234119782214156e-06, + "loss": 38.0475, + "step": 4375 + }, + { + "epoch": 15.798194130925507, + "grad_norm": 216.3609619140625, + "learning_rate": 6.2286751361161526e-06, + "loss": 37.1825, + "step": 4376 + }, + { + "epoch": 15.801805869074492, + "grad_norm": 275.54522705078125, + "learning_rate": 6.223230490018149e-06, + "loss": 38.5608, + "step": 4377 + }, + { + "epoch": 15.805417607223477, + "grad_norm": 226.7752685546875, + "learning_rate": 6.217785843920145e-06, + "loss": 38.0612, + "step": 4378 + }, + { + "epoch": 15.80902934537246, + "grad_norm": 262.14501953125, + "learning_rate": 6.212341197822142e-06, + "loss": 38.0049, + "step": 4379 + }, + { + "epoch": 15.812641083521445, + "grad_norm": 299.82196044921875, + "learning_rate": 6.206896551724138e-06, + "loss": 39.1441, + "step": 4380 + }, + { + "epoch": 15.812641083521445, + "eval_loss": 0.6033969521522522, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4380 + }, + { + "epoch": 15.816252821670428, + "grad_norm": 295.24188232421875, + "learning_rate": 6.2014519056261346e-06, + "loss": 39.266, + "step": 4381 + }, + { + "epoch": 15.819864559819413, + "grad_norm": 298.1729736328125, + "learning_rate": 6.196007259528131e-06, + "loss": 39.4025, + "step": 4382 + }, + { + "epoch": 15.823476297968398, + "grad_norm": 234.97958374023438, + "learning_rate": 6.190562613430127e-06, + "loss": 39.4752, + "step": 4383 + }, + { + "epoch": 15.827088036117381, + "grad_norm": 270.3009338378906, + "learning_rate": 6.185117967332124e-06, + "loss": 36.0322, + "step": 4384 + }, + { + "epoch": 15.830699774266366, + "grad_norm": 279.78314208984375, + "learning_rate": 6.1796733212341195e-06, + "loss": 33.3256, + "step": 4385 + }, + { + "epoch": 15.83431151241535, + "grad_norm": 258.82598876953125, + "learning_rate": 6.1742286751361166e-06, + "loss": 33.1552, + "step": 4386 + }, + { + "epoch": 15.837923250564334, + "grad_norm": 280.8109130859375, + "learning_rate": 6.168784029038113e-06, + "loss": 32.0024, + "step": 4387 + }, + { + "epoch": 15.841534988713319, + "grad_norm": 265.08111572265625, + "learning_rate": 6.163339382940109e-06, + "loss": 32.4901, + "step": 4388 + }, + { + "epoch": 15.845146726862303, + "grad_norm": 316.56427001953125, + "learning_rate": 6.157894736842105e-06, + "loss": 33.1995, + "step": 4389 + }, + { + "epoch": 15.848758465011286, + "grad_norm": 256.03717041015625, + "learning_rate": 6.1524500907441015e-06, + "loss": 33.1914, + "step": 4390 + }, + { + "epoch": 15.848758465011286, + "eval_loss": 0.6017575263977051, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 4390 + }, + { + "epoch": 15.852370203160271, + "grad_norm": 242.54119873046875, + "learning_rate": 6.1470054446460985e-06, + "loss": 33.8459, + "step": 4391 + }, + { + "epoch": 15.855981941309254, + "grad_norm": 259.1406555175781, + "learning_rate": 6.141560798548094e-06, + "loss": 34.1317, + "step": 4392 + }, + { + "epoch": 15.85959367945824, + "grad_norm": 272.77880859375, + "learning_rate": 6.136116152450091e-06, + "loss": 34.2777, + "step": 4393 + }, + { + "epoch": 15.863205417607224, + "grad_norm": 231.60845947265625, + "learning_rate": 6.130671506352087e-06, + "loss": 34.0165, + "step": 4394 + }, + { + "epoch": 15.866817155756207, + "grad_norm": 230.85675048828125, + "learning_rate": 6.125226860254084e-06, + "loss": 34.2761, + "step": 4395 + }, + { + "epoch": 15.870428893905192, + "grad_norm": 307.4486389160156, + "learning_rate": 6.11978221415608e-06, + "loss": 33.7407, + "step": 4396 + }, + { + "epoch": 15.874040632054175, + "grad_norm": 264.7835388183594, + "learning_rate": 6.114337568058076e-06, + "loss": 34.1672, + "step": 4397 + }, + { + "epoch": 15.87765237020316, + "grad_norm": 234.93968200683594, + "learning_rate": 6.108892921960073e-06, + "loss": 35.7158, + "step": 4398 + }, + { + "epoch": 15.881264108352145, + "grad_norm": 300.0079345703125, + "learning_rate": 6.103448275862069e-06, + "loss": 36.1292, + "step": 4399 + }, + { + "epoch": 15.884875846501128, + "grad_norm": 326.20416259765625, + "learning_rate": 6.0980036297640655e-06, + "loss": 34.8222, + "step": 4400 + }, + { + "epoch": 15.884875846501128, + "eval_loss": 0.6024067401885986, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4400 + }, + { + "epoch": 15.888487584650113, + "grad_norm": 214.6174774169922, + "learning_rate": 6.092558983666062e-06, + "loss": 27.4819, + "step": 4401 + }, + { + "epoch": 15.892099322799098, + "grad_norm": 222.7063446044922, + "learning_rate": 6.087114337568059e-06, + "loss": 22.3862, + "step": 4402 + }, + { + "epoch": 15.89571106094808, + "grad_norm": 277.0006103515625, + "learning_rate": 6.081669691470054e-06, + "loss": 22.8483, + "step": 4403 + }, + { + "epoch": 15.899322799097066, + "grad_norm": 264.3949890136719, + "learning_rate": 6.076225045372051e-06, + "loss": 23.2021, + "step": 4404 + }, + { + "epoch": 15.90293453724605, + "grad_norm": 244.04611206054688, + "learning_rate": 6.0707803992740475e-06, + "loss": 23.9378, + "step": 4405 + }, + { + "epoch": 15.906546275395034, + "grad_norm": 219.24403381347656, + "learning_rate": 6.065335753176044e-06, + "loss": 39.4708, + "step": 4406 + }, + { + "epoch": 15.910158013544018, + "grad_norm": 297.3822937011719, + "learning_rate": 6.05989110707804e-06, + "loss": 39.9151, + "step": 4407 + }, + { + "epoch": 15.913769751693001, + "grad_norm": 282.748291015625, + "learning_rate": 6.054446460980036e-06, + "loss": 39.0545, + "step": 4408 + }, + { + "epoch": 15.917381489841986, + "grad_norm": 274.6419982910156, + "learning_rate": 6.049001814882033e-06, + "loss": 39.7046, + "step": 4409 + }, + { + "epoch": 15.920993227990971, + "grad_norm": 261.2831115722656, + "learning_rate": 6.0435571687840295e-06, + "loss": 39.8849, + "step": 4410 + }, + { + "epoch": 15.920993227990971, + "eval_loss": 0.6017056107521057, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 4410 + }, + { + "epoch": 15.924604966139954, + "grad_norm": 276.61505126953125, + "learning_rate": 6.038112522686026e-06, + "loss": 39.8861, + "step": 4411 + }, + { + "epoch": 15.928216704288939, + "grad_norm": 273.4017333984375, + "learning_rate": 6.032667876588022e-06, + "loss": 36.2526, + "step": 4412 + }, + { + "epoch": 15.931828442437924, + "grad_norm": 314.4811706542969, + "learning_rate": 6.027223230490018e-06, + "loss": 37.1316, + "step": 4413 + }, + { + "epoch": 15.935440180586907, + "grad_norm": 265.7447204589844, + "learning_rate": 6.021778584392014e-06, + "loss": 38.1698, + "step": 4414 + }, + { + "epoch": 15.939051918735892, + "grad_norm": 448.373291015625, + "learning_rate": 6.016333938294011e-06, + "loss": 38.9541, + "step": 4415 + }, + { + "epoch": 15.942663656884875, + "grad_norm": 261.33966064453125, + "learning_rate": 6.010889292196008e-06, + "loss": 36.6694, + "step": 4416 + }, + { + "epoch": 15.94627539503386, + "grad_norm": 383.16363525390625, + "learning_rate": 6.005444646098004e-06, + "loss": 39.1773, + "step": 4417 + }, + { + "epoch": 15.949887133182845, + "grad_norm": 279.26446533203125, + "learning_rate": 6e-06, + "loss": 36.9482, + "step": 4418 + }, + { + "epoch": 15.953498871331828, + "grad_norm": 307.5321960449219, + "learning_rate": 5.994555353901996e-06, + "loss": 36.653, + "step": 4419 + }, + { + "epoch": 15.957110609480813, + "grad_norm": 412.80023193359375, + "learning_rate": 5.989110707803993e-06, + "loss": 36.3768, + "step": 4420 + }, + { + "epoch": 15.957110609480813, + "eval_loss": 0.6033455729484558, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4420 + }, + { + "epoch": 15.960722347629797, + "grad_norm": 254.2952880859375, + "learning_rate": 5.98366606170599e-06, + "loss": 32.546, + "step": 4421 + }, + { + "epoch": 15.96433408577878, + "grad_norm": 324.0749816894531, + "learning_rate": 5.978221415607985e-06, + "loss": 32.7021, + "step": 4422 + }, + { + "epoch": 15.967945823927765, + "grad_norm": 326.0075988769531, + "learning_rate": 5.972776769509982e-06, + "loss": 33.3823, + "step": 4423 + }, + { + "epoch": 15.97155756207675, + "grad_norm": 252.98471069335938, + "learning_rate": 5.967332123411978e-06, + "loss": 33.3397, + "step": 4424 + }, + { + "epoch": 15.975169300225733, + "grad_norm": 243.14117431640625, + "learning_rate": 5.9618874773139755e-06, + "loss": 34.2781, + "step": 4425 + }, + { + "epoch": 15.978781038374718, + "grad_norm": 304.3429260253906, + "learning_rate": 5.956442831215971e-06, + "loss": 34.1163, + "step": 4426 + }, + { + "epoch": 15.982392776523701, + "grad_norm": 320.1651916503906, + "learning_rate": 5.950998185117968e-06, + "loss": 34.1024, + "step": 4427 + }, + { + "epoch": 15.986004514672686, + "grad_norm": 252.0004425048828, + "learning_rate": 5.945553539019964e-06, + "loss": 35.8121, + "step": 4428 + }, + { + "epoch": 15.989616252821671, + "grad_norm": 342.5635986328125, + "learning_rate": 5.9401088929219595e-06, + "loss": 35.6666, + "step": 4429 + }, + { + "epoch": 15.993227990970654, + "grad_norm": 226.57249450683594, + "learning_rate": 5.934664246823957e-06, + "loss": 30.2617, + "step": 4430 + }, + { + "epoch": 15.993227990970654, + "eval_loss": 0.6029886603355408, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4430 + }, + { + "epoch": 15.996839729119639, + "grad_norm": 202.94903564453125, + "learning_rate": 5.929219600725953e-06, + "loss": 22.8166, + "step": 4431 + }, + { + "epoch": 16.0, + "grad_norm": 200.84317016601562, + "learning_rate": 5.92377495462795e-06, + "loss": 20.3903, + "step": 4432 + }, + { + "epoch": 16.003611738148983, + "grad_norm": 230.5917510986328, + "learning_rate": 5.918330308529945e-06, + "loss": 39.0985, + "step": 4433 + }, + { + "epoch": 16.00722347629797, + "grad_norm": 285.6978759765625, + "learning_rate": 5.912885662431942e-06, + "loss": 39.2128, + "step": 4434 + }, + { + "epoch": 16.010835214446953, + "grad_norm": 221.70896911621094, + "learning_rate": 5.907441016333939e-06, + "loss": 38.9026, + "step": 4435 + }, + { + "epoch": 16.014446952595936, + "grad_norm": 318.14068603515625, + "learning_rate": 5.901996370235935e-06, + "loss": 38.7336, + "step": 4436 + }, + { + "epoch": 16.018058690744923, + "grad_norm": 324.451904296875, + "learning_rate": 5.896551724137931e-06, + "loss": 38.7117, + "step": 4437 + }, + { + "epoch": 16.021670428893906, + "grad_norm": 295.038818359375, + "learning_rate": 5.891107078039927e-06, + "loss": 39.6053, + "step": 4438 + }, + { + "epoch": 16.02528216704289, + "grad_norm": 267.0055236816406, + "learning_rate": 5.885662431941924e-06, + "loss": 38.931, + "step": 4439 + }, + { + "epoch": 16.028893905191875, + "grad_norm": 269.20074462890625, + "learning_rate": 5.88021778584392e-06, + "loss": 41.1717, + "step": 4440 + }, + { + "epoch": 16.028893905191875, + "eval_loss": 0.6036069393157959, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.899, + "eval_steps_per_second": 56.899, + "step": 4440 + }, + { + "epoch": 16.03250564334086, + "grad_norm": 241.9443359375, + "learning_rate": 5.874773139745917e-06, + "loss": 38.7027, + "step": 4441 + }, + { + "epoch": 16.03611738148984, + "grad_norm": 238.54847717285156, + "learning_rate": 5.869328493647913e-06, + "loss": 39.1284, + "step": 4442 + }, + { + "epoch": 16.039729119638825, + "grad_norm": 339.3023681640625, + "learning_rate": 5.863883847549909e-06, + "loss": 38.0767, + "step": 4443 + }, + { + "epoch": 16.04334085778781, + "grad_norm": 257.29522705078125, + "learning_rate": 5.8584392014519055e-06, + "loss": 34.8207, + "step": 4444 + }, + { + "epoch": 16.046952595936794, + "grad_norm": 264.24200439453125, + "learning_rate": 5.852994555353902e-06, + "loss": 35.5021, + "step": 4445 + }, + { + "epoch": 16.050564334085777, + "grad_norm": 251.3128662109375, + "learning_rate": 5.847549909255899e-06, + "loss": 35.7826, + "step": 4446 + }, + { + "epoch": 16.054176072234764, + "grad_norm": 310.6581726074219, + "learning_rate": 5.842105263157895e-06, + "loss": 36.7373, + "step": 4447 + }, + { + "epoch": 16.057787810383747, + "grad_norm": 299.07550048828125, + "learning_rate": 5.836660617059891e-06, + "loss": 36.4048, + "step": 4448 + }, + { + "epoch": 16.06139954853273, + "grad_norm": 257.58740234375, + "learning_rate": 5.8312159709618875e-06, + "loss": 36.3982, + "step": 4449 + }, + { + "epoch": 16.065011286681717, + "grad_norm": 337.6795654296875, + "learning_rate": 5.825771324863884e-06, + "loss": 36.8518, + "step": 4450 + }, + { + "epoch": 16.065011286681717, + "eval_loss": 0.6036850214004517, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4450 + }, + { + "epoch": 16.0686230248307, + "grad_norm": 275.02423095703125, + "learning_rate": 5.820326678765881e-06, + "loss": 36.1763, + "step": 4451 + }, + { + "epoch": 16.072234762979683, + "grad_norm": 263.4334716796875, + "learning_rate": 5.814882032667876e-06, + "loss": 37.6417, + "step": 4452 + }, + { + "epoch": 16.07584650112867, + "grad_norm": 213.16749572753906, + "learning_rate": 5.809437386569873e-06, + "loss": 35.6537, + "step": 4453 + }, + { + "epoch": 16.079458239277653, + "grad_norm": 263.4288330078125, + "learning_rate": 5.8039927404718695e-06, + "loss": 36.5693, + "step": 4454 + }, + { + "epoch": 16.083069977426636, + "grad_norm": 284.67254638671875, + "learning_rate": 5.798548094373866e-06, + "loss": 37.3424, + "step": 4455 + }, + { + "epoch": 16.086681715575622, + "grad_norm": 355.7987060546875, + "learning_rate": 5.793103448275862e-06, + "loss": 38.7851, + "step": 4456 + }, + { + "epoch": 16.090293453724605, + "grad_norm": 249.7351531982422, + "learning_rate": 5.787658802177859e-06, + "loss": 38.1334, + "step": 4457 + }, + { + "epoch": 16.09390519187359, + "grad_norm": 257.4977722167969, + "learning_rate": 5.782214156079855e-06, + "loss": 37.8369, + "step": 4458 + }, + { + "epoch": 16.097516930022575, + "grad_norm": 242.59584045410156, + "learning_rate": 5.776769509981851e-06, + "loss": 37.4005, + "step": 4459 + }, + { + "epoch": 16.101128668171558, + "grad_norm": 270.0740966796875, + "learning_rate": 5.771324863883848e-06, + "loss": 38.2287, + "step": 4460 + }, + { + "epoch": 16.101128668171558, + "eval_loss": 0.6018803119659424, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 4460 + }, + { + "epoch": 16.10474040632054, + "grad_norm": 225.32322692871094, + "learning_rate": 5.765880217785844e-06, + "loss": 35.7162, + "step": 4461 + }, + { + "epoch": 16.108352144469524, + "grad_norm": 275.3272705078125, + "learning_rate": 5.760435571687841e-06, + "loss": 32.8733, + "step": 4462 + }, + { + "epoch": 16.11196388261851, + "grad_norm": 259.5124206542969, + "learning_rate": 5.7549909255898364e-06, + "loss": 33.2271, + "step": 4463 + }, + { + "epoch": 16.115575620767494, + "grad_norm": 249.75738525390625, + "learning_rate": 5.7495462794918335e-06, + "loss": 30.2931, + "step": 4464 + }, + { + "epoch": 16.119187358916477, + "grad_norm": 277.7652282714844, + "learning_rate": 5.74410163339383e-06, + "loss": 30.9294, + "step": 4465 + }, + { + "epoch": 16.122799097065464, + "grad_norm": 223.28250122070312, + "learning_rate": 5.738656987295825e-06, + "loss": 31.7337, + "step": 4466 + }, + { + "epoch": 16.126410835214447, + "grad_norm": 259.5106201171875, + "learning_rate": 5.733212341197822e-06, + "loss": 31.2897, + "step": 4467 + }, + { + "epoch": 16.13002257336343, + "grad_norm": 241.0313720703125, + "learning_rate": 5.7277676950998184e-06, + "loss": 32.8436, + "step": 4468 + }, + { + "epoch": 16.133634311512417, + "grad_norm": 277.46905517578125, + "learning_rate": 5.7223230490018155e-06, + "loss": 33.6823, + "step": 4469 + }, + { + "epoch": 16.1372460496614, + "grad_norm": 264.2905578613281, + "learning_rate": 5.716878402903811e-06, + "loss": 33.1107, + "step": 4470 + }, + { + "epoch": 16.1372460496614, + "eval_loss": 0.6046355962753296, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 4470 + }, + { + "epoch": 16.140857787810383, + "grad_norm": 295.5188903808594, + "learning_rate": 5.711433756805808e-06, + "loss": 33.6291, + "step": 4471 + }, + { + "epoch": 16.14446952595937, + "grad_norm": 282.6014709472656, + "learning_rate": 5.705989110707804e-06, + "loss": 33.0773, + "step": 4472 + }, + { + "epoch": 16.148081264108352, + "grad_norm": 270.7958679199219, + "learning_rate": 5.7005444646098004e-06, + "loss": 35.0269, + "step": 4473 + }, + { + "epoch": 16.151693002257336, + "grad_norm": 344.7304992675781, + "learning_rate": 5.695099818511797e-06, + "loss": 35.1349, + "step": 4474 + }, + { + "epoch": 16.155304740406322, + "grad_norm": 294.5618896484375, + "learning_rate": 5.689655172413793e-06, + "loss": 36.3309, + "step": 4475 + }, + { + "epoch": 16.158916478555305, + "grad_norm": 305.5354309082031, + "learning_rate": 5.68421052631579e-06, + "loss": 35.0976, + "step": 4476 + }, + { + "epoch": 16.16252821670429, + "grad_norm": 293.9934387207031, + "learning_rate": 5.678765880217786e-06, + "loss": 34.9113, + "step": 4477 + }, + { + "epoch": 16.16613995485327, + "grad_norm": 277.9523010253906, + "learning_rate": 5.6733212341197824e-06, + "loss": 24.8815, + "step": 4478 + }, + { + "epoch": 16.169751693002258, + "grad_norm": 297.0547790527344, + "learning_rate": 5.667876588021779e-06, + "loss": 22.4544, + "step": 4479 + }, + { + "epoch": 16.17336343115124, + "grad_norm": 237.44741821289062, + "learning_rate": 5.662431941923776e-06, + "loss": 21.8323, + "step": 4480 + }, + { + "epoch": 16.17336343115124, + "eval_loss": 0.6061411499977112, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4480 + }, + { + "epoch": 16.176975169300224, + "grad_norm": 220.5832977294922, + "learning_rate": 5.656987295825771e-06, + "loss": 22.7531, + "step": 4481 + }, + { + "epoch": 16.18058690744921, + "grad_norm": 298.8033142089844, + "learning_rate": 5.651542649727767e-06, + "loss": 23.7107, + "step": 4482 + }, + { + "epoch": 16.184198645598194, + "grad_norm": 250.02593994140625, + "learning_rate": 5.6460980036297644e-06, + "loss": 39.1679, + "step": 4483 + }, + { + "epoch": 16.187810383747177, + "grad_norm": 253.00746154785156, + "learning_rate": 5.640653357531761e-06, + "loss": 40.6492, + "step": 4484 + }, + { + "epoch": 16.191422121896164, + "grad_norm": 215.04270935058594, + "learning_rate": 5.635208711433757e-06, + "loss": 38.604, + "step": 4485 + }, + { + "epoch": 16.195033860045147, + "grad_norm": 395.6152648925781, + "learning_rate": 5.629764065335753e-06, + "loss": 39.1417, + "step": 4486 + }, + { + "epoch": 16.19864559819413, + "grad_norm": 380.3653869628906, + "learning_rate": 5.62431941923775e-06, + "loss": 39.4322, + "step": 4487 + }, + { + "epoch": 16.202257336343116, + "grad_norm": 309.3524475097656, + "learning_rate": 5.6188747731397464e-06, + "loss": 39.1721, + "step": 4488 + }, + { + "epoch": 16.2058690744921, + "grad_norm": 237.88262939453125, + "learning_rate": 5.613430127041742e-06, + "loss": 39.1462, + "step": 4489 + }, + { + "epoch": 16.209480812641083, + "grad_norm": 233.66690063476562, + "learning_rate": 5.607985480943739e-06, + "loss": 39.8177, + "step": 4490 + }, + { + "epoch": 16.209480812641083, + "eval_loss": 0.6043822169303894, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4490 + }, + { + "epoch": 16.21309255079007, + "grad_norm": 229.3720703125, + "learning_rate": 5.602540834845735e-06, + "loss": 39.7878, + "step": 4491 + }, + { + "epoch": 16.216704288939052, + "grad_norm": 228.66493225097656, + "learning_rate": 5.597096188747731e-06, + "loss": 40.0754, + "step": 4492 + }, + { + "epoch": 16.220316027088035, + "grad_norm": 276.40240478515625, + "learning_rate": 5.591651542649728e-06, + "loss": 38.7709, + "step": 4493 + }, + { + "epoch": 16.223927765237022, + "grad_norm": 268.62371826171875, + "learning_rate": 5.586206896551725e-06, + "loss": 37.7439, + "step": 4494 + }, + { + "epoch": 16.227539503386005, + "grad_norm": 271.0934753417969, + "learning_rate": 5.580762250453721e-06, + "loss": 38.2511, + "step": 4495 + }, + { + "epoch": 16.231151241534988, + "grad_norm": 253.63385009765625, + "learning_rate": 5.575317604355716e-06, + "loss": 36.716, + "step": 4496 + }, + { + "epoch": 16.23476297968397, + "grad_norm": 265.1177978515625, + "learning_rate": 5.569872958257713e-06, + "loss": 36.5517, + "step": 4497 + }, + { + "epoch": 16.238374717832958, + "grad_norm": 332.52972412109375, + "learning_rate": 5.56442831215971e-06, + "loss": 37.1524, + "step": 4498 + }, + { + "epoch": 16.24198645598194, + "grad_norm": 247.53643798828125, + "learning_rate": 5.558983666061707e-06, + "loss": 36.6666, + "step": 4499 + }, + { + "epoch": 16.245598194130924, + "grad_norm": 233.3318634033203, + "learning_rate": 5.553539019963702e-06, + "loss": 37.0842, + "step": 4500 + }, + { + "epoch": 16.245598194130924, + "eval_loss": 0.6042913794517517, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4500 + }, + { + "epoch": 16.24920993227991, + "grad_norm": 222.98350524902344, + "learning_rate": 5.548094373865699e-06, + "loss": 37.6382, + "step": 4501 + }, + { + "epoch": 16.252821670428894, + "grad_norm": 234.33267211914062, + "learning_rate": 5.542649727767695e-06, + "loss": 38.0509, + "step": 4502 + }, + { + "epoch": 16.256433408577877, + "grad_norm": 303.56005859375, + "learning_rate": 5.5372050816696924e-06, + "loss": 36.509, + "step": 4503 + }, + { + "epoch": 16.260045146726863, + "grad_norm": 232.0821075439453, + "learning_rate": 5.531760435571688e-06, + "loss": 36.3975, + "step": 4504 + }, + { + "epoch": 16.263656884875846, + "grad_norm": 223.3292236328125, + "learning_rate": 5.526315789473684e-06, + "loss": 37.0448, + "step": 4505 + }, + { + "epoch": 16.26726862302483, + "grad_norm": 241.2131805419922, + "learning_rate": 5.520871143375681e-06, + "loss": 37.8635, + "step": 4506 + }, + { + "epoch": 16.270880361173816, + "grad_norm": 288.62689208984375, + "learning_rate": 5.5154264972776765e-06, + "loss": 38.2789, + "step": 4507 + }, + { + "epoch": 16.2744920993228, + "grad_norm": 262.59637451171875, + "learning_rate": 5.5099818511796736e-06, + "loss": 37.9052, + "step": 4508 + }, + { + "epoch": 16.278103837471782, + "grad_norm": 258.0476379394531, + "learning_rate": 5.50453720508167e-06, + "loss": 38.0485, + "step": 4509 + }, + { + "epoch": 16.28171557562077, + "grad_norm": 295.2730407714844, + "learning_rate": 5.499092558983667e-06, + "loss": 37.6134, + "step": 4510 + }, + { + "epoch": 16.28171557562077, + "eval_loss": 0.601740300655365, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4510 + }, + { + "epoch": 16.285327313769752, + "grad_norm": 246.38548278808594, + "learning_rate": 5.493647912885662e-06, + "loss": 36.1289, + "step": 4511 + }, + { + "epoch": 16.288939051918735, + "grad_norm": 271.28997802734375, + "learning_rate": 5.4882032667876585e-06, + "loss": 31.8834, + "step": 4512 + }, + { + "epoch": 16.292550790067722, + "grad_norm": 231.76246643066406, + "learning_rate": 5.4827586206896556e-06, + "loss": 31.4899, + "step": 4513 + }, + { + "epoch": 16.296162528216705, + "grad_norm": 238.7414093017578, + "learning_rate": 5.477313974591652e-06, + "loss": 31.7102, + "step": 4514 + }, + { + "epoch": 16.299774266365688, + "grad_norm": 302.0710144042969, + "learning_rate": 5.471869328493648e-06, + "loss": 31.3557, + "step": 4515 + }, + { + "epoch": 16.30338600451467, + "grad_norm": 282.72015380859375, + "learning_rate": 5.466424682395644e-06, + "loss": 33.0781, + "step": 4516 + }, + { + "epoch": 16.306997742663658, + "grad_norm": 224.8140869140625, + "learning_rate": 5.460980036297641e-06, + "loss": 33.2963, + "step": 4517 + }, + { + "epoch": 16.31060948081264, + "grad_norm": 239.20570373535156, + "learning_rate": 5.4555353901996376e-06, + "loss": 34.4455, + "step": 4518 + }, + { + "epoch": 16.314221218961624, + "grad_norm": 304.7758483886719, + "learning_rate": 5.450090744101633e-06, + "loss": 34.534, + "step": 4519 + }, + { + "epoch": 16.31783295711061, + "grad_norm": 274.8758239746094, + "learning_rate": 5.44464609800363e-06, + "loss": 33.5232, + "step": 4520 + }, + { + "epoch": 16.31783295711061, + "eval_loss": 0.6031973958015442, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4520 + }, + { + "epoch": 16.321444695259594, + "grad_norm": 295.1776428222656, + "learning_rate": 5.439201451905626e-06, + "loss": 33.403, + "step": 4521 + }, + { + "epoch": 16.325056433408577, + "grad_norm": 309.03399658203125, + "learning_rate": 5.4337568058076225e-06, + "loss": 34.1785, + "step": 4522 + }, + { + "epoch": 16.328668171557563, + "grad_norm": 285.26385498046875, + "learning_rate": 5.428312159709619e-06, + "loss": 34.4855, + "step": 4523 + }, + { + "epoch": 16.332279909706546, + "grad_norm": 307.0184020996094, + "learning_rate": 5.422867513611616e-06, + "loss": 32.4791, + "step": 4524 + }, + { + "epoch": 16.33589164785553, + "grad_norm": 318.8267822265625, + "learning_rate": 5.417422867513612e-06, + "loss": 35.697, + "step": 4525 + }, + { + "epoch": 16.339503386004516, + "grad_norm": 356.0179138183594, + "learning_rate": 5.411978221415607e-06, + "loss": 36.1811, + "step": 4526 + }, + { + "epoch": 16.3431151241535, + "grad_norm": 332.1255187988281, + "learning_rate": 5.4065335753176045e-06, + "loss": 36.2251, + "step": 4527 + }, + { + "epoch": 16.346726862302482, + "grad_norm": 288.78118896484375, + "learning_rate": 5.401088929219601e-06, + "loss": 32.0518, + "step": 4528 + }, + { + "epoch": 16.35033860045147, + "grad_norm": 250.37245178222656, + "learning_rate": 5.395644283121598e-06, + "loss": 23.627, + "step": 4529 + }, + { + "epoch": 16.353950338600452, + "grad_norm": 199.92352294921875, + "learning_rate": 5.390199637023593e-06, + "loss": 21.7919, + "step": 4530 + }, + { + "epoch": 16.353950338600452, + "eval_loss": 0.6021688580513, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 4530 + }, + { + "epoch": 16.357562076749435, + "grad_norm": 265.47015380859375, + "learning_rate": 5.38475499092559e-06, + "loss": 23.0672, + "step": 4531 + }, + { + "epoch": 16.36117381489842, + "grad_norm": 281.188720703125, + "learning_rate": 5.3793103448275865e-06, + "loss": 22.7983, + "step": 4532 + }, + { + "epoch": 16.364785553047405, + "grad_norm": 195.5351104736328, + "learning_rate": 5.373865698729583e-06, + "loss": 38.1042, + "step": 4533 + }, + { + "epoch": 16.368397291196388, + "grad_norm": 234.76573181152344, + "learning_rate": 5.368421052631579e-06, + "loss": 39.8602, + "step": 4534 + }, + { + "epoch": 16.37200902934537, + "grad_norm": 237.9152374267578, + "learning_rate": 5.362976406533575e-06, + "loss": 40.2156, + "step": 4535 + }, + { + "epoch": 16.375620767494357, + "grad_norm": 297.722900390625, + "learning_rate": 5.357531760435572e-06, + "loss": 39.3676, + "step": 4536 + }, + { + "epoch": 16.37923250564334, + "grad_norm": 218.61727905273438, + "learning_rate": 5.352087114337568e-06, + "loss": 38.7905, + "step": 4537 + }, + { + "epoch": 16.382844243792324, + "grad_norm": 245.19561767578125, + "learning_rate": 5.346642468239565e-06, + "loss": 39.3998, + "step": 4538 + }, + { + "epoch": 16.38645598194131, + "grad_norm": 247.5048370361328, + "learning_rate": 5.341197822141561e-06, + "loss": 40.0835, + "step": 4539 + }, + { + "epoch": 16.390067720090293, + "grad_norm": 214.40684509277344, + "learning_rate": 5.335753176043558e-06, + "loss": 39.1135, + "step": 4540 + }, + { + "epoch": 16.390067720090293, + "eval_loss": 0.6014460325241089, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4540 + }, + { + "epoch": 16.393679458239276, + "grad_norm": 216.72271728515625, + "learning_rate": 5.330308529945553e-06, + "loss": 38.9449, + "step": 4541 + }, + { + "epoch": 16.397291196388263, + "grad_norm": 224.22262573242188, + "learning_rate": 5.32486388384755e-06, + "loss": 39.2646, + "step": 4542 + }, + { + "epoch": 16.400902934537246, + "grad_norm": 258.6524353027344, + "learning_rate": 5.319419237749547e-06, + "loss": 38.0846, + "step": 4543 + }, + { + "epoch": 16.40451467268623, + "grad_norm": 241.7313232421875, + "learning_rate": 5.313974591651543e-06, + "loss": 37.4963, + "step": 4544 + }, + { + "epoch": 16.408126410835216, + "grad_norm": 241.3990478515625, + "learning_rate": 5.308529945553539e-06, + "loss": 36.4783, + "step": 4545 + }, + { + "epoch": 16.4117381489842, + "grad_norm": 207.1470947265625, + "learning_rate": 5.303085299455535e-06, + "loss": 36.1592, + "step": 4546 + }, + { + "epoch": 16.415349887133182, + "grad_norm": 224.51690673828125, + "learning_rate": 5.2976406533575325e-06, + "loss": 35.7946, + "step": 4547 + }, + { + "epoch": 16.41896162528217, + "grad_norm": 292.4340515136719, + "learning_rate": 5.292196007259528e-06, + "loss": 36.8986, + "step": 4548 + }, + { + "epoch": 16.42257336343115, + "grad_norm": 244.67117309570312, + "learning_rate": 5.286751361161524e-06, + "loss": 37.1165, + "step": 4549 + }, + { + "epoch": 16.426185101580135, + "grad_norm": 331.14654541015625, + "learning_rate": 5.281306715063521e-06, + "loss": 36.4423, + "step": 4550 + }, + { + "epoch": 16.426185101580135, + "eval_loss": 0.6067427396774292, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4550 + }, + { + "epoch": 16.42979683972912, + "grad_norm": 262.373046875, + "learning_rate": 5.275862068965517e-06, + "loss": 39.0014, + "step": 4551 + }, + { + "epoch": 16.433408577878104, + "grad_norm": 237.48350524902344, + "learning_rate": 5.270417422867514e-06, + "loss": 38.0152, + "step": 4552 + }, + { + "epoch": 16.437020316027088, + "grad_norm": 273.0652770996094, + "learning_rate": 5.26497277676951e-06, + "loss": 37.6952, + "step": 4553 + }, + { + "epoch": 16.44063205417607, + "grad_norm": 239.0780029296875, + "learning_rate": 5.259528130671507e-06, + "loss": 38.4266, + "step": 4554 + }, + { + "epoch": 16.444243792325057, + "grad_norm": 277.978759765625, + "learning_rate": 5.254083484573503e-06, + "loss": 36.5596, + "step": 4555 + }, + { + "epoch": 16.44785553047404, + "grad_norm": 216.2267303466797, + "learning_rate": 5.248638838475499e-06, + "loss": 39.1408, + "step": 4556 + }, + { + "epoch": 16.451467268623023, + "grad_norm": 231.80581665039062, + "learning_rate": 5.243194192377496e-06, + "loss": 38.7286, + "step": 4557 + }, + { + "epoch": 16.45507900677201, + "grad_norm": 236.4004669189453, + "learning_rate": 5.237749546279492e-06, + "loss": 39.2426, + "step": 4558 + }, + { + "epoch": 16.458690744920993, + "grad_norm": 270.0268859863281, + "learning_rate": 5.232304900181488e-06, + "loss": 38.6546, + "step": 4559 + }, + { + "epoch": 16.462302483069976, + "grad_norm": 255.8044891357422, + "learning_rate": 5.226860254083484e-06, + "loss": 37.554, + "step": 4560 + }, + { + "epoch": 16.462302483069976, + "eval_loss": 0.6019929647445679, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4560 + }, + { + "epoch": 16.465914221218963, + "grad_norm": 321.18499755859375, + "learning_rate": 5.221415607985481e-06, + "loss": 34.9309, + "step": 4561 + }, + { + "epoch": 16.469525959367946, + "grad_norm": 311.94305419921875, + "learning_rate": 5.215970961887478e-06, + "loss": 35.8779, + "step": 4562 + }, + { + "epoch": 16.47313769751693, + "grad_norm": 211.90234375, + "learning_rate": 5.210526315789474e-06, + "loss": 31.8385, + "step": 4563 + }, + { + "epoch": 16.476749435665916, + "grad_norm": 284.64581298828125, + "learning_rate": 5.20508166969147e-06, + "loss": 31.8078, + "step": 4564 + }, + { + "epoch": 16.4803611738149, + "grad_norm": 291.94891357421875, + "learning_rate": 5.199637023593466e-06, + "loss": 33.2542, + "step": 4565 + }, + { + "epoch": 16.483972911963882, + "grad_norm": 243.61956787109375, + "learning_rate": 5.194192377495463e-06, + "loss": 31.5292, + "step": 4566 + }, + { + "epoch": 16.48758465011287, + "grad_norm": 242.07696533203125, + "learning_rate": 5.188747731397459e-06, + "loss": 33.9643, + "step": 4567 + }, + { + "epoch": 16.49119638826185, + "grad_norm": 255.0625457763672, + "learning_rate": 5.183303085299456e-06, + "loss": 33.7718, + "step": 4568 + }, + { + "epoch": 16.494808126410835, + "grad_norm": 249.40240478515625, + "learning_rate": 5.177858439201452e-06, + "loss": 31.5248, + "step": 4569 + }, + { + "epoch": 16.498419864559818, + "grad_norm": 231.3375244140625, + "learning_rate": 5.172413793103449e-06, + "loss": 34.5657, + "step": 4570 + }, + { + "epoch": 16.498419864559818, + "eval_loss": 0.6017265319824219, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.01, + "eval_steps_per_second": 57.01, + "step": 4570 + }, + { + "epoch": 16.502031602708804, + "grad_norm": 247.97012329101562, + "learning_rate": 5.1669691470054445e-06, + "loss": 33.766, + "step": 4571 + }, + { + "epoch": 16.505643340857787, + "grad_norm": 310.730224609375, + "learning_rate": 5.161524500907441e-06, + "loss": 34.0841, + "step": 4572 + }, + { + "epoch": 16.50925507900677, + "grad_norm": 323.5569152832031, + "learning_rate": 5.156079854809438e-06, + "loss": 35.0788, + "step": 4573 + }, + { + "epoch": 16.512866817155757, + "grad_norm": 247.95480346679688, + "learning_rate": 5.150635208711433e-06, + "loss": 33.5322, + "step": 4574 + }, + { + "epoch": 16.51647855530474, + "grad_norm": 307.6163024902344, + "learning_rate": 5.14519056261343e-06, + "loss": 34.4701, + "step": 4575 + }, + { + "epoch": 16.520090293453723, + "grad_norm": 239.569580078125, + "learning_rate": 5.1397459165154265e-06, + "loss": 35.8526, + "step": 4576 + }, + { + "epoch": 16.52370203160271, + "grad_norm": 362.4159240722656, + "learning_rate": 5.134301270417424e-06, + "loss": 36.2235, + "step": 4577 + }, + { + "epoch": 16.527313769751693, + "grad_norm": 321.2509765625, + "learning_rate": 5.128856624319419e-06, + "loss": 33.4705, + "step": 4578 + }, + { + "epoch": 16.530925507900676, + "grad_norm": 248.6092071533203, + "learning_rate": 5.123411978221415e-06, + "loss": 23.1329, + "step": 4579 + }, + { + "epoch": 16.534537246049663, + "grad_norm": 289.8996276855469, + "learning_rate": 5.117967332123412e-06, + "loss": 20.3184, + "step": 4580 + }, + { + "epoch": 16.534537246049663, + "eval_loss": 0.6034744381904602, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 4580 + }, + { + "epoch": 16.538148984198646, + "grad_norm": 215.02142333984375, + "learning_rate": 5.1125226860254085e-06, + "loss": 23.0513, + "step": 4581 + }, + { + "epoch": 16.54176072234763, + "grad_norm": 299.8429870605469, + "learning_rate": 5.107078039927405e-06, + "loss": 24.462, + "step": 4582 + }, + { + "epoch": 16.545372460496615, + "grad_norm": 267.0840759277344, + "learning_rate": 5.101633393829401e-06, + "loss": 39.9148, + "step": 4583 + }, + { + "epoch": 16.5489841986456, + "grad_norm": 227.23731994628906, + "learning_rate": 5.096188747731398e-06, + "loss": 40.6498, + "step": 4584 + }, + { + "epoch": 16.55259593679458, + "grad_norm": 313.9705810546875, + "learning_rate": 5.0907441016333935e-06, + "loss": 38.7711, + "step": 4585 + }, + { + "epoch": 16.55620767494357, + "grad_norm": 398.0429382324219, + "learning_rate": 5.0852994555353905e-06, + "loss": 39.6938, + "step": 4586 + }, + { + "epoch": 16.55981941309255, + "grad_norm": 365.489990234375, + "learning_rate": 5.079854809437387e-06, + "loss": 39.356, + "step": 4587 + }, + { + "epoch": 16.563431151241534, + "grad_norm": 365.05267333984375, + "learning_rate": 5.074410163339383e-06, + "loss": 40.2504, + "step": 4588 + }, + { + "epoch": 16.567042889390518, + "grad_norm": 288.0643310546875, + "learning_rate": 5.068965517241379e-06, + "loss": 39.6045, + "step": 4589 + }, + { + "epoch": 16.570654627539504, + "grad_norm": 262.0147705078125, + "learning_rate": 5.0635208711433755e-06, + "loss": 40.2504, + "step": 4590 + }, + { + "epoch": 16.570654627539504, + "eval_loss": 0.6028281450271606, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 4590 + }, + { + "epoch": 16.574266365688487, + "grad_norm": 325.78387451171875, + "learning_rate": 5.0580762250453725e-06, + "loss": 40.3154, + "step": 4591 + }, + { + "epoch": 16.57787810383747, + "grad_norm": 221.56591796875, + "learning_rate": 5.052631578947369e-06, + "loss": 39.5046, + "step": 4592 + }, + { + "epoch": 16.581489841986457, + "grad_norm": 227.02520751953125, + "learning_rate": 5.047186932849365e-06, + "loss": 38.3611, + "step": 4593 + }, + { + "epoch": 16.58510158013544, + "grad_norm": 232.46922302246094, + "learning_rate": 5.041742286751361e-06, + "loss": 36.5043, + "step": 4594 + }, + { + "epoch": 16.588713318284423, + "grad_norm": 230.59536743164062, + "learning_rate": 5.0362976406533575e-06, + "loss": 36.2179, + "step": 4595 + }, + { + "epoch": 16.59232505643341, + "grad_norm": 439.9609069824219, + "learning_rate": 5.0308529945553545e-06, + "loss": 36.4797, + "step": 4596 + }, + { + "epoch": 16.595936794582393, + "grad_norm": 322.4086608886719, + "learning_rate": 5.02540834845735e-06, + "loss": 37.4151, + "step": 4597 + }, + { + "epoch": 16.599548532731376, + "grad_norm": 318.1732482910156, + "learning_rate": 5.019963702359347e-06, + "loss": 37.2815, + "step": 4598 + }, + { + "epoch": 16.603160270880363, + "grad_norm": 321.34039306640625, + "learning_rate": 5.014519056261343e-06, + "loss": 36.8388, + "step": 4599 + }, + { + "epoch": 16.606772009029346, + "grad_norm": 341.28790283203125, + "learning_rate": 5.0090744101633395e-06, + "loss": 37.9805, + "step": 4600 + }, + { + "epoch": 16.606772009029346, + "eval_loss": 0.6045316457748413, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 4600 + }, + { + "epoch": 16.61038374717833, + "grad_norm": 259.9163513183594, + "learning_rate": 5.003629764065336e-06, + "loss": 37.5832, + "step": 4601 + }, + { + "epoch": 16.613995485327315, + "grad_norm": 297.02587890625, + "learning_rate": 4.998185117967332e-06, + "loss": 37.3808, + "step": 4602 + }, + { + "epoch": 16.6176072234763, + "grad_norm": 263.32244873046875, + "learning_rate": 4.992740471869329e-06, + "loss": 37.1047, + "step": 4603 + }, + { + "epoch": 16.62121896162528, + "grad_norm": 262.26104736328125, + "learning_rate": 4.987295825771324e-06, + "loss": 38.3592, + "step": 4604 + }, + { + "epoch": 16.624830699774268, + "grad_norm": 253.7144012451172, + "learning_rate": 4.9818511796733215e-06, + "loss": 37.4098, + "step": 4605 + }, + { + "epoch": 16.62844243792325, + "grad_norm": 279.1004943847656, + "learning_rate": 4.976406533575318e-06, + "loss": 39.3865, + "step": 4606 + }, + { + "epoch": 16.632054176072234, + "grad_norm": 298.7977600097656, + "learning_rate": 4.970961887477315e-06, + "loss": 38.6865, + "step": 4607 + }, + { + "epoch": 16.635665914221217, + "grad_norm": 256.7657470703125, + "learning_rate": 4.96551724137931e-06, + "loss": 38.7068, + "step": 4608 + }, + { + "epoch": 16.639277652370204, + "grad_norm": 238.22979736328125, + "learning_rate": 4.960072595281307e-06, + "loss": 37.749, + "step": 4609 + }, + { + "epoch": 16.642889390519187, + "grad_norm": 248.4231414794922, + "learning_rate": 4.9546279491833035e-06, + "loss": 37.582, + "step": 4610 + }, + { + "epoch": 16.642889390519187, + "eval_loss": 0.6026645302772522, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4610 + }, + { + "epoch": 16.64650112866817, + "grad_norm": 232.70289611816406, + "learning_rate": 4.949183303085299e-06, + "loss": 34.4589, + "step": 4611 + }, + { + "epoch": 16.650112866817157, + "grad_norm": 268.4678955078125, + "learning_rate": 4.943738656987296e-06, + "loss": 32.3619, + "step": 4612 + }, + { + "epoch": 16.65372460496614, + "grad_norm": 272.07794189453125, + "learning_rate": 4.938294010889292e-06, + "loss": 32.3436, + "step": 4613 + }, + { + "epoch": 16.657336343115123, + "grad_norm": 304.4588317871094, + "learning_rate": 4.932849364791289e-06, + "loss": 30.8798, + "step": 4614 + }, + { + "epoch": 16.66094808126411, + "grad_norm": 293.3638000488281, + "learning_rate": 4.927404718693285e-06, + "loss": 31.1892, + "step": 4615 + }, + { + "epoch": 16.664559819413093, + "grad_norm": 292.844482421875, + "learning_rate": 4.921960072595282e-06, + "loss": 31.9604, + "step": 4616 + }, + { + "epoch": 16.668171557562076, + "grad_norm": 246.45339965820312, + "learning_rate": 4.916515426497278e-06, + "loss": 32.242, + "step": 4617 + }, + { + "epoch": 16.671783295711062, + "grad_norm": 269.9577941894531, + "learning_rate": 4.911070780399274e-06, + "loss": 32.5072, + "step": 4618 + }, + { + "epoch": 16.675395033860045, + "grad_norm": 312.8960876464844, + "learning_rate": 4.90562613430127e-06, + "loss": 33.8243, + "step": 4619 + }, + { + "epoch": 16.67900677200903, + "grad_norm": 287.4557189941406, + "learning_rate": 4.900181488203267e-06, + "loss": 34.3557, + "step": 4620 + }, + { + "epoch": 16.67900677200903, + "eval_loss": 0.6047338843345642, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4620 + }, + { + "epoch": 16.682618510158015, + "grad_norm": 403.533935546875, + "learning_rate": 4.894736842105264e-06, + "loss": 34.6895, + "step": 4621 + }, + { + "epoch": 16.686230248306998, + "grad_norm": 387.5083923339844, + "learning_rate": 4.88929219600726e-06, + "loss": 34.2407, + "step": 4622 + }, + { + "epoch": 16.68984198645598, + "grad_norm": 278.8225402832031, + "learning_rate": 4.883847549909256e-06, + "loss": 33.3489, + "step": 4623 + }, + { + "epoch": 16.693453724604964, + "grad_norm": 270.46685791015625, + "learning_rate": 4.878402903811252e-06, + "loss": 34.2095, + "step": 4624 + }, + { + "epoch": 16.69706546275395, + "grad_norm": 244.6392059326172, + "learning_rate": 4.872958257713249e-06, + "loss": 35.783, + "step": 4625 + }, + { + "epoch": 16.700677200902934, + "grad_norm": 327.0617370605469, + "learning_rate": 4.867513611615245e-06, + "loss": 36.4928, + "step": 4626 + }, + { + "epoch": 16.704288939051917, + "grad_norm": 297.0531311035156, + "learning_rate": 4.862068965517241e-06, + "loss": 33.4827, + "step": 4627 + }, + { + "epoch": 16.707900677200904, + "grad_norm": 366.2174377441406, + "learning_rate": 4.856624319419238e-06, + "loss": 26.9456, + "step": 4628 + }, + { + "epoch": 16.711512415349887, + "grad_norm": 436.22613525390625, + "learning_rate": 4.851179673321234e-06, + "loss": 22.2349, + "step": 4629 + }, + { + "epoch": 16.71512415349887, + "grad_norm": 391.7647705078125, + "learning_rate": 4.845735027223231e-06, + "loss": 22.8557, + "step": 4630 + }, + { + "epoch": 16.71512415349887, + "eval_loss": 0.6052708029747009, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 4630 + }, + { + "epoch": 16.718735891647857, + "grad_norm": 277.8678283691406, + "learning_rate": 4.840290381125227e-06, + "loss": 23.3521, + "step": 4631 + }, + { + "epoch": 16.72234762979684, + "grad_norm": 252.46131896972656, + "learning_rate": 4.834845735027224e-06, + "loss": 23.7394, + "step": 4632 + }, + { + "epoch": 16.725959367945823, + "grad_norm": 214.6287078857422, + "learning_rate": 4.82940108892922e-06, + "loss": 38.6633, + "step": 4633 + }, + { + "epoch": 16.72957110609481, + "grad_norm": 257.454345703125, + "learning_rate": 4.8239564428312155e-06, + "loss": 40.5165, + "step": 4634 + }, + { + "epoch": 16.733182844243792, + "grad_norm": 211.1912841796875, + "learning_rate": 4.818511796733213e-06, + "loss": 38.483, + "step": 4635 + }, + { + "epoch": 16.736794582392776, + "grad_norm": 226.8388214111328, + "learning_rate": 4.813067150635209e-06, + "loss": 39.6143, + "step": 4636 + }, + { + "epoch": 16.740406320541762, + "grad_norm": 263.8160400390625, + "learning_rate": 4.807622504537205e-06, + "loss": 37.8442, + "step": 4637 + }, + { + "epoch": 16.744018058690745, + "grad_norm": 284.8119201660156, + "learning_rate": 4.802177858439201e-06, + "loss": 39.1835, + "step": 4638 + }, + { + "epoch": 16.74762979683973, + "grad_norm": 310.31390380859375, + "learning_rate": 4.796733212341198e-06, + "loss": 38.7035, + "step": 4639 + }, + { + "epoch": 16.751241534988715, + "grad_norm": 212.71315002441406, + "learning_rate": 4.791288566243195e-06, + "loss": 38.8803, + "step": 4640 + }, + { + "epoch": 16.751241534988715, + "eval_loss": 0.6030828952789307, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4640 + }, + { + "epoch": 16.754853273137698, + "grad_norm": 209.7708740234375, + "learning_rate": 4.78584392014519e-06, + "loss": 39.0808, + "step": 4641 + }, + { + "epoch": 16.75846501128668, + "grad_norm": 251.971435546875, + "learning_rate": 4.780399274047187e-06, + "loss": 39.2025, + "step": 4642 + }, + { + "epoch": 16.762076749435664, + "grad_norm": 210.54151916503906, + "learning_rate": 4.774954627949183e-06, + "loss": 37.7541, + "step": 4643 + }, + { + "epoch": 16.76568848758465, + "grad_norm": 221.22119140625, + "learning_rate": 4.76950998185118e-06, + "loss": 36.4328, + "step": 4644 + }, + { + "epoch": 16.769300225733634, + "grad_norm": 201.45025634765625, + "learning_rate": 4.764065335753176e-06, + "loss": 34.9771, + "step": 4645 + }, + { + "epoch": 16.772911963882617, + "grad_norm": 241.33030700683594, + "learning_rate": 4.758620689655173e-06, + "loss": 37.6231, + "step": 4646 + }, + { + "epoch": 16.776523702031604, + "grad_norm": 282.12255859375, + "learning_rate": 4.753176043557169e-06, + "loss": 36.9822, + "step": 4647 + }, + { + "epoch": 16.780135440180587, + "grad_norm": 239.93885803222656, + "learning_rate": 4.747731397459165e-06, + "loss": 36.3529, + "step": 4648 + }, + { + "epoch": 16.78374717832957, + "grad_norm": 245.9400634765625, + "learning_rate": 4.7422867513611615e-06, + "loss": 37.518, + "step": 4649 + }, + { + "epoch": 16.787358916478556, + "grad_norm": 280.63720703125, + "learning_rate": 4.736842105263158e-06, + "loss": 37.6323, + "step": 4650 + }, + { + "epoch": 16.787358916478556, + "eval_loss": 0.6054876446723938, + "eval_runtime": 3.1439, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4650 + }, + { + "epoch": 16.79097065462754, + "grad_norm": 368.47698974609375, + "learning_rate": 4.731397459165155e-06, + "loss": 38.1543, + "step": 4651 + }, + { + "epoch": 16.794582392776523, + "grad_norm": 346.9169616699219, + "learning_rate": 4.72595281306715e-06, + "loss": 38.8746, + "step": 4652 + }, + { + "epoch": 16.79819413092551, + "grad_norm": 311.7519836425781, + "learning_rate": 4.720508166969147e-06, + "loss": 37.3475, + "step": 4653 + }, + { + "epoch": 16.801805869074492, + "grad_norm": 323.14910888671875, + "learning_rate": 4.7150635208711435e-06, + "loss": 38.5308, + "step": 4654 + }, + { + "epoch": 16.805417607223475, + "grad_norm": 252.71958923339844, + "learning_rate": 4.70961887477314e-06, + "loss": 38.3275, + "step": 4655 + }, + { + "epoch": 16.809029345372462, + "grad_norm": 364.2929382324219, + "learning_rate": 4.704174228675136e-06, + "loss": 38.9973, + "step": 4656 + }, + { + "epoch": 16.812641083521445, + "grad_norm": 267.23980712890625, + "learning_rate": 4.698729582577132e-06, + "loss": 38.0867, + "step": 4657 + }, + { + "epoch": 16.816252821670428, + "grad_norm": 297.4647521972656, + "learning_rate": 4.693284936479129e-06, + "loss": 38.6933, + "step": 4658 + }, + { + "epoch": 16.819864559819415, + "grad_norm": 276.2767333984375, + "learning_rate": 4.6878402903811255e-06, + "loss": 38.0279, + "step": 4659 + }, + { + "epoch": 16.823476297968398, + "grad_norm": 261.5404052734375, + "learning_rate": 4.682395644283122e-06, + "loss": 36.5149, + "step": 4660 + }, + { + "epoch": 16.823476297968398, + "eval_loss": 0.6019832491874695, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4660 + }, + { + "epoch": 16.82708803611738, + "grad_norm": 313.2170104980469, + "learning_rate": 4.676950998185118e-06, + "loss": 35.6121, + "step": 4661 + }, + { + "epoch": 16.830699774266364, + "grad_norm": 297.2791442871094, + "learning_rate": 4.671506352087115e-06, + "loss": 31.1869, + "step": 4662 + }, + { + "epoch": 16.83431151241535, + "grad_norm": 269.7320556640625, + "learning_rate": 4.666061705989111e-06, + "loss": 31.8674, + "step": 4663 + }, + { + "epoch": 16.837923250564334, + "grad_norm": 245.3898468017578, + "learning_rate": 4.660617059891107e-06, + "loss": 30.3726, + "step": 4664 + }, + { + "epoch": 16.841534988713317, + "grad_norm": 244.63223266601562, + "learning_rate": 4.655172413793104e-06, + "loss": 32.6154, + "step": 4665 + }, + { + "epoch": 16.845146726862303, + "grad_norm": 263.6791076660156, + "learning_rate": 4.6497277676951e-06, + "loss": 33.0104, + "step": 4666 + }, + { + "epoch": 16.848758465011286, + "grad_norm": 398.6610107421875, + "learning_rate": 4.644283121597096e-06, + "loss": 32.5445, + "step": 4667 + }, + { + "epoch": 16.85237020316027, + "grad_norm": 312.8116149902344, + "learning_rate": 4.6388384754990924e-06, + "loss": 32.5698, + "step": 4668 + }, + { + "epoch": 16.855981941309256, + "grad_norm": 296.6167297363281, + "learning_rate": 4.6333938294010895e-06, + "loss": 33.1377, + "step": 4669 + }, + { + "epoch": 16.85959367945824, + "grad_norm": 285.299560546875, + "learning_rate": 4.627949183303086e-06, + "loss": 33.3279, + "step": 4670 + }, + { + "epoch": 16.85959367945824, + "eval_loss": 0.6027817726135254, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.985, + "eval_steps_per_second": 56.985, + "step": 4670 + }, + { + "epoch": 16.863205417607222, + "grad_norm": 285.2948913574219, + "learning_rate": 4.622504537205081e-06, + "loss": 35.6879, + "step": 4671 + }, + { + "epoch": 16.86681715575621, + "grad_norm": 280.6530456542969, + "learning_rate": 4.617059891107078e-06, + "loss": 32.3154, + "step": 4672 + }, + { + "epoch": 16.870428893905192, + "grad_norm": 314.206787109375, + "learning_rate": 4.6116152450090744e-06, + "loss": 34.3517, + "step": 4673 + }, + { + "epoch": 16.874040632054175, + "grad_norm": 305.9198913574219, + "learning_rate": 4.6061705989110715e-06, + "loss": 34.1571, + "step": 4674 + }, + { + "epoch": 16.877652370203162, + "grad_norm": 287.0543212890625, + "learning_rate": 4.600725952813067e-06, + "loss": 35.1647, + "step": 4675 + }, + { + "epoch": 16.881264108352145, + "grad_norm": 286.912109375, + "learning_rate": 4.595281306715064e-06, + "loss": 34.8698, + "step": 4676 + }, + { + "epoch": 16.884875846501128, + "grad_norm": 322.4527587890625, + "learning_rate": 4.58983666061706e-06, + "loss": 36.3449, + "step": 4677 + }, + { + "epoch": 16.888487584650115, + "grad_norm": 239.41659545898438, + "learning_rate": 4.584392014519056e-06, + "loss": 25.3085, + "step": 4678 + }, + { + "epoch": 16.892099322799098, + "grad_norm": 215.5685577392578, + "learning_rate": 4.578947368421053e-06, + "loss": 22.3485, + "step": 4679 + }, + { + "epoch": 16.89571106094808, + "grad_norm": 291.2452697753906, + "learning_rate": 4.573502722323049e-06, + "loss": 22.3257, + "step": 4680 + }, + { + "epoch": 16.89571106094808, + "eval_loss": 0.6040940284729004, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 4680 + }, + { + "epoch": 16.899322799097064, + "grad_norm": 291.39935302734375, + "learning_rate": 4.568058076225046e-06, + "loss": 23.268, + "step": 4681 + }, + { + "epoch": 16.90293453724605, + "grad_norm": 272.211181640625, + "learning_rate": 4.562613430127041e-06, + "loss": 23.7127, + "step": 4682 + }, + { + "epoch": 16.906546275395034, + "grad_norm": 220.84397888183594, + "learning_rate": 4.5571687840290384e-06, + "loss": 39.2488, + "step": 4683 + }, + { + "epoch": 16.910158013544017, + "grad_norm": 238.49859619140625, + "learning_rate": 4.551724137931035e-06, + "loss": 39.5643, + "step": 4684 + }, + { + "epoch": 16.913769751693003, + "grad_norm": 325.3870544433594, + "learning_rate": 4.546279491833032e-06, + "loss": 38.6149, + "step": 4685 + }, + { + "epoch": 16.917381489841986, + "grad_norm": 307.02349853515625, + "learning_rate": 4.540834845735027e-06, + "loss": 38.0317, + "step": 4686 + }, + { + "epoch": 16.92099322799097, + "grad_norm": 433.99359130859375, + "learning_rate": 4.535390199637023e-06, + "loss": 40.4567, + "step": 4687 + }, + { + "epoch": 16.924604966139956, + "grad_norm": 327.97015380859375, + "learning_rate": 4.5299455535390204e-06, + "loss": 40.3109, + "step": 4688 + }, + { + "epoch": 16.92821670428894, + "grad_norm": 257.20684814453125, + "learning_rate": 4.524500907441017e-06, + "loss": 36.2826, + "step": 4689 + }, + { + "epoch": 16.931828442437922, + "grad_norm": 402.6732177734375, + "learning_rate": 4.519056261343013e-06, + "loss": 36.9163, + "step": 4690 + }, + { + "epoch": 16.931828442437922, + "eval_loss": 0.6016727089881897, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4690 + }, + { + "epoch": 16.93544018058691, + "grad_norm": 380.8903503417969, + "learning_rate": 4.513611615245009e-06, + "loss": 36.7101, + "step": 4691 + }, + { + "epoch": 16.939051918735892, + "grad_norm": 365.4950256347656, + "learning_rate": 4.508166969147006e-06, + "loss": 37.9853, + "step": 4692 + }, + { + "epoch": 16.942663656884875, + "grad_norm": 302.3895568847656, + "learning_rate": 4.5027223230490016e-06, + "loss": 38.109, + "step": 4693 + }, + { + "epoch": 16.94627539503386, + "grad_norm": 333.5274963378906, + "learning_rate": 4.497277676950998e-06, + "loss": 37.5992, + "step": 4694 + }, + { + "epoch": 16.949887133182845, + "grad_norm": 364.3126525878906, + "learning_rate": 4.491833030852995e-06, + "loss": 38.0139, + "step": 4695 + }, + { + "epoch": 16.953498871331828, + "grad_norm": 509.94671630859375, + "learning_rate": 4.486388384754991e-06, + "loss": 39.8027, + "step": 4696 + }, + { + "epoch": 16.957110609480814, + "grad_norm": 507.8591613769531, + "learning_rate": 4.480943738656987e-06, + "loss": 40.0044, + "step": 4697 + }, + { + "epoch": 16.960722347629797, + "grad_norm": 324.5463562011719, + "learning_rate": 4.4754990925589836e-06, + "loss": 34.9058, + "step": 4698 + }, + { + "epoch": 16.96433408577878, + "grad_norm": 318.39801025390625, + "learning_rate": 4.470054446460981e-06, + "loss": 33.1318, + "step": 4699 + }, + { + "epoch": 16.967945823927764, + "grad_norm": 391.8466796875, + "learning_rate": 4.464609800362977e-06, + "loss": 32.2083, + "step": 4700 + }, + { + "epoch": 16.967945823927764, + "eval_loss": 0.6047930717468262, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4700 + }, + { + "epoch": 16.97155756207675, + "grad_norm": 530.4073486328125, + "learning_rate": 4.459165154264972e-06, + "loss": 31.9882, + "step": 4701 + }, + { + "epoch": 16.975169300225733, + "grad_norm": 590.9242553710938, + "learning_rate": 4.453720508166969e-06, + "loss": 34.1937, + "step": 4702 + }, + { + "epoch": 16.978781038374716, + "grad_norm": 377.5596618652344, + "learning_rate": 4.4482758620689656e-06, + "loss": 34.6501, + "step": 4703 + }, + { + "epoch": 16.982392776523703, + "grad_norm": 431.2909240722656, + "learning_rate": 4.442831215970962e-06, + "loss": 33.9402, + "step": 4704 + }, + { + "epoch": 16.986004514672686, + "grad_norm": 294.7673645019531, + "learning_rate": 4.437386569872958e-06, + "loss": 33.7873, + "step": 4705 + }, + { + "epoch": 16.98961625282167, + "grad_norm": 346.1203918457031, + "learning_rate": 4.431941923774955e-06, + "loss": 35.2935, + "step": 4706 + }, + { + "epoch": 16.993227990970656, + "grad_norm": 257.8351745605469, + "learning_rate": 4.426497277676951e-06, + "loss": 28.3513, + "step": 4707 + }, + { + "epoch": 16.99683972911964, + "grad_norm": 168.35118103027344, + "learning_rate": 4.421052631578947e-06, + "loss": 22.3009, + "step": 4708 + }, + { + "epoch": 17.0, + "grad_norm": 210.20738220214844, + "learning_rate": 4.415607985480944e-06, + "loss": 20.1848, + "step": 4709 + }, + { + "epoch": 17.003611738148983, + "grad_norm": 234.40866088867188, + "learning_rate": 4.41016333938294e-06, + "loss": 38.0969, + "step": 4710 + }, + { + "epoch": 17.003611738148983, + "eval_loss": 0.6026900410652161, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4710 + }, + { + "epoch": 17.00722347629797, + "grad_norm": 242.27195739746094, + "learning_rate": 4.404718693284937e-06, + "loss": 38.8902, + "step": 4711 + }, + { + "epoch": 17.010835214446953, + "grad_norm": 215.1695556640625, + "learning_rate": 4.3992740471869325e-06, + "loss": 38.5509, + "step": 4712 + }, + { + "epoch": 17.014446952595936, + "grad_norm": 390.2027587890625, + "learning_rate": 4.3938294010889296e-06, + "loss": 38.5247, + "step": 4713 + }, + { + "epoch": 17.018058690744923, + "grad_norm": 397.77484130859375, + "learning_rate": 4.388384754990926e-06, + "loss": 39.1981, + "step": 4714 + }, + { + "epoch": 17.021670428893906, + "grad_norm": 298.10089111328125, + "learning_rate": 4.382940108892923e-06, + "loss": 38.2627, + "step": 4715 + }, + { + "epoch": 17.02528216704289, + "grad_norm": 291.7283935546875, + "learning_rate": 4.377495462794918e-06, + "loss": 38.8027, + "step": 4716 + }, + { + "epoch": 17.028893905191875, + "grad_norm": 254.8542938232422, + "learning_rate": 4.3720508166969145e-06, + "loss": 38.6095, + "step": 4717 + }, + { + "epoch": 17.03250564334086, + "grad_norm": 244.336181640625, + "learning_rate": 4.3666061705989116e-06, + "loss": 38.2955, + "step": 4718 + }, + { + "epoch": 17.03611738148984, + "grad_norm": 376.92523193359375, + "learning_rate": 4.361161524500907e-06, + "loss": 38.5203, + "step": 4719 + }, + { + "epoch": 17.039729119638825, + "grad_norm": 339.6172790527344, + "learning_rate": 4.355716878402904e-06, + "loss": 37.4332, + "step": 4720 + }, + { + "epoch": 17.039729119638825, + "eval_loss": 0.6024167537689209, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4720 + }, + { + "epoch": 17.04334085778781, + "grad_norm": 433.0855712890625, + "learning_rate": 4.3502722323049e-06, + "loss": 36.4444, + "step": 4721 + }, + { + "epoch": 17.046952595936794, + "grad_norm": 224.3468475341797, + "learning_rate": 4.344827586206897e-06, + "loss": 35.7802, + "step": 4722 + }, + { + "epoch": 17.050564334085777, + "grad_norm": 385.5466003417969, + "learning_rate": 4.339382940108893e-06, + "loss": 35.4641, + "step": 4723 + }, + { + "epoch": 17.054176072234764, + "grad_norm": 311.80596923828125, + "learning_rate": 4.333938294010889e-06, + "loss": 36.4231, + "step": 4724 + }, + { + "epoch": 17.057787810383747, + "grad_norm": 283.189453125, + "learning_rate": 4.328493647912886e-06, + "loss": 37.5405, + "step": 4725 + }, + { + "epoch": 17.06139954853273, + "grad_norm": 403.85833740234375, + "learning_rate": 4.323049001814882e-06, + "loss": 37.4723, + "step": 4726 + }, + { + "epoch": 17.065011286681717, + "grad_norm": 390.03515625, + "learning_rate": 4.3176043557168785e-06, + "loss": 36.6799, + "step": 4727 + }, + { + "epoch": 17.0686230248307, + "grad_norm": 318.63427734375, + "learning_rate": 4.312159709618875e-06, + "loss": 36.6312, + "step": 4728 + }, + { + "epoch": 17.072234762979683, + "grad_norm": 318.43402099609375, + "learning_rate": 4.306715063520872e-06, + "loss": 37.9104, + "step": 4729 + }, + { + "epoch": 17.07584650112867, + "grad_norm": 320.9336853027344, + "learning_rate": 4.301270417422867e-06, + "loss": 36.7254, + "step": 4730 + }, + { + "epoch": 17.07584650112867, + "eval_loss": 0.6046721339225769, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4730 + }, + { + "epoch": 17.079458239277653, + "grad_norm": 345.9001770019531, + "learning_rate": 4.295825771324863e-06, + "loss": 36.0298, + "step": 4731 + }, + { + "epoch": 17.083069977426636, + "grad_norm": 397.10369873046875, + "learning_rate": 4.2903811252268605e-06, + "loss": 37.9418, + "step": 4732 + }, + { + "epoch": 17.086681715575622, + "grad_norm": 293.1039123535156, + "learning_rate": 4.284936479128857e-06, + "loss": 37.2627, + "step": 4733 + }, + { + "epoch": 17.090293453724605, + "grad_norm": 412.5190734863281, + "learning_rate": 4.279491833030853e-06, + "loss": 38.3429, + "step": 4734 + }, + { + "epoch": 17.09390519187359, + "grad_norm": 241.35105895996094, + "learning_rate": 4.274047186932849e-06, + "loss": 38.559, + "step": 4735 + }, + { + "epoch": 17.097516930022575, + "grad_norm": 275.169189453125, + "learning_rate": 4.268602540834846e-06, + "loss": 36.8167, + "step": 4736 + }, + { + "epoch": 17.101128668171558, + "grad_norm": 272.3182678222656, + "learning_rate": 4.2631578947368425e-06, + "loss": 37.0246, + "step": 4737 + }, + { + "epoch": 17.10474040632054, + "grad_norm": 215.6425018310547, + "learning_rate": 4.257713248638839e-06, + "loss": 33.1282, + "step": 4738 + }, + { + "epoch": 17.108352144469524, + "grad_norm": 276.6223449707031, + "learning_rate": 4.252268602540835e-06, + "loss": 33.2698, + "step": 4739 + }, + { + "epoch": 17.11196388261851, + "grad_norm": 311.1632385253906, + "learning_rate": 4.246823956442831e-06, + "loss": 31.0105, + "step": 4740 + }, + { + "epoch": 17.11196388261851, + "eval_loss": 0.6019421815872192, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.089, + "eval_steps_per_second": 57.089, + "step": 4740 + }, + { + "epoch": 17.115575620767494, + "grad_norm": 254.7543487548828, + "learning_rate": 4.241379310344828e-06, + "loss": 31.4721, + "step": 4741 + }, + { + "epoch": 17.119187358916477, + "grad_norm": 239.24957275390625, + "learning_rate": 4.235934664246824e-06, + "loss": 31.0346, + "step": 4742 + }, + { + "epoch": 17.122799097065464, + "grad_norm": 262.0681457519531, + "learning_rate": 4.230490018148821e-06, + "loss": 32.0604, + "step": 4743 + }, + { + "epoch": 17.126410835214447, + "grad_norm": 218.3557586669922, + "learning_rate": 4.225045372050817e-06, + "loss": 32.2036, + "step": 4744 + }, + { + "epoch": 17.13002257336343, + "grad_norm": 277.5924072265625, + "learning_rate": 4.219600725952813e-06, + "loss": 32.1412, + "step": 4745 + }, + { + "epoch": 17.133634311512417, + "grad_norm": 226.93211364746094, + "learning_rate": 4.214156079854809e-06, + "loss": 34.3367, + "step": 4746 + }, + { + "epoch": 17.1372460496614, + "grad_norm": 303.2422180175781, + "learning_rate": 4.208711433756806e-06, + "loss": 33.2001, + "step": 4747 + }, + { + "epoch": 17.140857787810383, + "grad_norm": 257.6164245605469, + "learning_rate": 4.203266787658803e-06, + "loss": 34.155, + "step": 4748 + }, + { + "epoch": 17.14446952595937, + "grad_norm": 361.1567077636719, + "learning_rate": 4.197822141560798e-06, + "loss": 35.236, + "step": 4749 + }, + { + "epoch": 17.148081264108352, + "grad_norm": 292.0034484863281, + "learning_rate": 4.192377495462795e-06, + "loss": 34.304, + "step": 4750 + }, + { + "epoch": 17.148081264108352, + "eval_loss": 0.6034401059150696, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4750 + }, + { + "epoch": 17.151693002257336, + "grad_norm": 327.8070983886719, + "learning_rate": 4.186932849364791e-06, + "loss": 33.7346, + "step": 4751 + }, + { + "epoch": 17.155304740406322, + "grad_norm": 312.9547119140625, + "learning_rate": 4.1814882032667885e-06, + "loss": 35.9274, + "step": 4752 + }, + { + "epoch": 17.158916478555305, + "grad_norm": 305.19500732421875, + "learning_rate": 4.176043557168784e-06, + "loss": 35.5567, + "step": 4753 + }, + { + "epoch": 17.16252821670429, + "grad_norm": 339.37152099609375, + "learning_rate": 4.17059891107078e-06, + "loss": 35.8013, + "step": 4754 + }, + { + "epoch": 17.16613995485327, + "grad_norm": 247.36679077148438, + "learning_rate": 4.165154264972777e-06, + "loss": 29.2211, + "step": 4755 + }, + { + "epoch": 17.169751693002258, + "grad_norm": 255.65269470214844, + "learning_rate": 4.1597096188747725e-06, + "loss": 21.6191, + "step": 4756 + }, + { + "epoch": 17.17336343115124, + "grad_norm": 239.66448974609375, + "learning_rate": 4.15426497277677e-06, + "loss": 22.0521, + "step": 4757 + }, + { + "epoch": 17.176975169300224, + "grad_norm": 212.25955200195312, + "learning_rate": 4.148820326678766e-06, + "loss": 22.6641, + "step": 4758 + }, + { + "epoch": 17.18058690744921, + "grad_norm": 229.9394073486328, + "learning_rate": 4.143375680580763e-06, + "loss": 22.8787, + "step": 4759 + }, + { + "epoch": 17.184198645598194, + "grad_norm": 237.46343994140625, + "learning_rate": 4.137931034482758e-06, + "loss": 39.1222, + "step": 4760 + }, + { + "epoch": 17.184198645598194, + "eval_loss": 0.6031526327133179, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4760 + }, + { + "epoch": 17.187810383747177, + "grad_norm": 229.23849487304688, + "learning_rate": 4.132486388384755e-06, + "loss": 39.7664, + "step": 4761 + }, + { + "epoch": 17.191422121896164, + "grad_norm": 250.67529296875, + "learning_rate": 4.127041742286752e-06, + "loss": 38.6754, + "step": 4762 + }, + { + "epoch": 17.195033860045147, + "grad_norm": 272.9320068359375, + "learning_rate": 4.121597096188748e-06, + "loss": 39.1262, + "step": 4763 + }, + { + "epoch": 17.19864559819413, + "grad_norm": 267.82427978515625, + "learning_rate": 4.116152450090744e-06, + "loss": 38.2223, + "step": 4764 + }, + { + "epoch": 17.202257336343116, + "grad_norm": 266.35760498046875, + "learning_rate": 4.11070780399274e-06, + "loss": 39.2069, + "step": 4765 + }, + { + "epoch": 17.2058690744921, + "grad_norm": 221.62606811523438, + "learning_rate": 4.105263157894737e-06, + "loss": 38.8956, + "step": 4766 + }, + { + "epoch": 17.209480812641083, + "grad_norm": 243.73110961914062, + "learning_rate": 4.099818511796734e-06, + "loss": 41.5868, + "step": 4767 + }, + { + "epoch": 17.21309255079007, + "grad_norm": 268.6092224121094, + "learning_rate": 4.09437386569873e-06, + "loss": 39.1041, + "step": 4768 + }, + { + "epoch": 17.216704288939052, + "grad_norm": 300.3140563964844, + "learning_rate": 4.088929219600726e-06, + "loss": 38.25, + "step": 4769 + }, + { + "epoch": 17.220316027088035, + "grad_norm": 264.56805419921875, + "learning_rate": 4.083484573502722e-06, + "loss": 38.186, + "step": 4770 + }, + { + "epoch": 17.220316027088035, + "eval_loss": 0.6044566631317139, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4770 + }, + { + "epoch": 17.223927765237022, + "grad_norm": 303.47686767578125, + "learning_rate": 4.0780399274047185e-06, + "loss": 37.7011, + "step": 4771 + }, + { + "epoch": 17.227539503386005, + "grad_norm": 238.3590545654297, + "learning_rate": 4.072595281306715e-06, + "loss": 34.6695, + "step": 4772 + }, + { + "epoch": 17.231151241534988, + "grad_norm": 252.90081787109375, + "learning_rate": 4.067150635208712e-06, + "loss": 36.1903, + "step": 4773 + }, + { + "epoch": 17.23476297968397, + "grad_norm": 286.5584716796875, + "learning_rate": 4.061705989110708e-06, + "loss": 36.4185, + "step": 4774 + }, + { + "epoch": 17.238374717832958, + "grad_norm": 322.25323486328125, + "learning_rate": 4.056261343012704e-06, + "loss": 36.0098, + "step": 4775 + }, + { + "epoch": 17.24198645598194, + "grad_norm": 292.09405517578125, + "learning_rate": 4.0508166969147005e-06, + "loss": 35.4347, + "step": 4776 + }, + { + "epoch": 17.245598194130924, + "grad_norm": 295.9725341796875, + "learning_rate": 4.045372050816697e-06, + "loss": 37.3512, + "step": 4777 + }, + { + "epoch": 17.24920993227991, + "grad_norm": 326.34539794921875, + "learning_rate": 4.039927404718694e-06, + "loss": 38.6739, + "step": 4778 + }, + { + "epoch": 17.252821670428894, + "grad_norm": 384.3682861328125, + "learning_rate": 4.034482758620689e-06, + "loss": 38.0995, + "step": 4779 + }, + { + "epoch": 17.256433408577877, + "grad_norm": 400.59136962890625, + "learning_rate": 4.029038112522686e-06, + "loss": 36.7733, + "step": 4780 + }, + { + "epoch": 17.256433408577877, + "eval_loss": 0.6064656972885132, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4780 + }, + { + "epoch": 17.260045146726863, + "grad_norm": 379.5261535644531, + "learning_rate": 4.0235934664246825e-06, + "loss": 36.1385, + "step": 4781 + }, + { + "epoch": 17.263656884875846, + "grad_norm": 277.1004638671875, + "learning_rate": 4.018148820326679e-06, + "loss": 39.1495, + "step": 4782 + }, + { + "epoch": 17.26726862302483, + "grad_norm": 274.6176452636719, + "learning_rate": 4.012704174228675e-06, + "loss": 37.8503, + "step": 4783 + }, + { + "epoch": 17.270880361173816, + "grad_norm": 338.9375305175781, + "learning_rate": 4.007259528130671e-06, + "loss": 39.7149, + "step": 4784 + }, + { + "epoch": 17.2744920993228, + "grad_norm": 299.60662841796875, + "learning_rate": 4.001814882032668e-06, + "loss": 37.6013, + "step": 4785 + }, + { + "epoch": 17.278103837471782, + "grad_norm": 278.9190368652344, + "learning_rate": 3.996370235934664e-06, + "loss": 38.1106, + "step": 4786 + }, + { + "epoch": 17.28171557562077, + "grad_norm": 254.48443603515625, + "learning_rate": 3.990925589836661e-06, + "loss": 35.9676, + "step": 4787 + }, + { + "epoch": 17.285327313769752, + "grad_norm": 274.65338134765625, + "learning_rate": 3.985480943738657e-06, + "loss": 35.3535, + "step": 4788 + }, + { + "epoch": 17.288939051918735, + "grad_norm": 288.748779296875, + "learning_rate": 3.980036297640654e-06, + "loss": 32.7356, + "step": 4789 + }, + { + "epoch": 17.292550790067722, + "grad_norm": 229.0682830810547, + "learning_rate": 3.9745916515426495e-06, + "loss": 31.2048, + "step": 4790 + }, + { + "epoch": 17.292550790067722, + "eval_loss": 0.6020387411117554, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 4790 + }, + { + "epoch": 17.296162528216705, + "grad_norm": 234.29937744140625, + "learning_rate": 3.9691470054446465e-06, + "loss": 31.7953, + "step": 4791 + }, + { + "epoch": 17.299774266365688, + "grad_norm": 236.3527069091797, + "learning_rate": 3.963702359346643e-06, + "loss": 31.6686, + "step": 4792 + }, + { + "epoch": 17.30338600451467, + "grad_norm": 253.44126892089844, + "learning_rate": 3.958257713248639e-06, + "loss": 31.8848, + "step": 4793 + }, + { + "epoch": 17.306997742663658, + "grad_norm": 270.66046142578125, + "learning_rate": 3.952813067150635e-06, + "loss": 32.1593, + "step": 4794 + }, + { + "epoch": 17.31060948081264, + "grad_norm": 242.77777099609375, + "learning_rate": 3.9473684210526315e-06, + "loss": 32.4555, + "step": 4795 + }, + { + "epoch": 17.314221218961624, + "grad_norm": 243.9296112060547, + "learning_rate": 3.9419237749546285e-06, + "loss": 34.0444, + "step": 4796 + }, + { + "epoch": 17.31783295711061, + "grad_norm": 276.2138671875, + "learning_rate": 3.936479128856624e-06, + "loss": 32.0404, + "step": 4797 + }, + { + "epoch": 17.321444695259594, + "grad_norm": 262.97802734375, + "learning_rate": 3.931034482758621e-06, + "loss": 32.4535, + "step": 4798 + }, + { + "epoch": 17.325056433408577, + "grad_norm": 338.9852600097656, + "learning_rate": 3.925589836660617e-06, + "loss": 34.6855, + "step": 4799 + }, + { + "epoch": 17.328668171557563, + "grad_norm": 270.85650634765625, + "learning_rate": 3.9201451905626135e-06, + "loss": 32.2425, + "step": 4800 + }, + { + "epoch": 17.328668171557563, + "eval_loss": 0.603055477142334, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4800 + }, + { + "epoch": 17.332279909706546, + "grad_norm": 289.17584228515625, + "learning_rate": 3.91470054446461e-06, + "loss": 34.6461, + "step": 4801 + }, + { + "epoch": 17.33589164785553, + "grad_norm": 301.120361328125, + "learning_rate": 3.909255898366606e-06, + "loss": 34.5622, + "step": 4802 + }, + { + "epoch": 17.339503386004516, + "grad_norm": 328.93524169921875, + "learning_rate": 3.903811252268603e-06, + "loss": 34.9585, + "step": 4803 + }, + { + "epoch": 17.3431151241535, + "grad_norm": 445.72003173828125, + "learning_rate": 3.898366606170599e-06, + "loss": 36.9729, + "step": 4804 + }, + { + "epoch": 17.346726862302482, + "grad_norm": 249.7901153564453, + "learning_rate": 3.8929219600725955e-06, + "loss": 30.1609, + "step": 4805 + }, + { + "epoch": 17.35033860045147, + "grad_norm": 230.1756134033203, + "learning_rate": 3.887477313974592e-06, + "loss": 21.6742, + "step": 4806 + }, + { + "epoch": 17.353950338600452, + "grad_norm": 193.68104553222656, + "learning_rate": 3.882032667876588e-06, + "loss": 22.0064, + "step": 4807 + }, + { + "epoch": 17.357562076749435, + "grad_norm": 232.58486938476562, + "learning_rate": 3.876588021778585e-06, + "loss": 23.1576, + "step": 4808 + }, + { + "epoch": 17.36117381489842, + "grad_norm": 256.0340270996094, + "learning_rate": 3.87114337568058e-06, + "loss": 23.5346, + "step": 4809 + }, + { + "epoch": 17.364785553047405, + "grad_norm": 260.8665771484375, + "learning_rate": 3.8656987295825775e-06, + "loss": 39.5267, + "step": 4810 + }, + { + "epoch": 17.364785553047405, + "eval_loss": 0.6040924191474915, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 4810 + }, + { + "epoch": 17.368397291196388, + "grad_norm": 253.2076873779297, + "learning_rate": 3.860254083484574e-06, + "loss": 40.222, + "step": 4811 + }, + { + "epoch": 17.37200902934537, + "grad_norm": 232.68162536621094, + "learning_rate": 3.85480943738657e-06, + "loss": 38.8405, + "step": 4812 + }, + { + "epoch": 17.375620767494357, + "grad_norm": 264.7735290527344, + "learning_rate": 3.849364791288566e-06, + "loss": 37.8169, + "step": 4813 + }, + { + "epoch": 17.37923250564334, + "grad_norm": 305.1289978027344, + "learning_rate": 3.843920145190563e-06, + "loss": 39.4413, + "step": 4814 + }, + { + "epoch": 17.382844243792324, + "grad_norm": 409.03106689453125, + "learning_rate": 3.8384754990925594e-06, + "loss": 40.146, + "step": 4815 + }, + { + "epoch": 17.38645598194131, + "grad_norm": 307.2272644042969, + "learning_rate": 3.833030852994555e-06, + "loss": 39.0141, + "step": 4816 + }, + { + "epoch": 17.390067720090293, + "grad_norm": 272.6708068847656, + "learning_rate": 3.827586206896552e-06, + "loss": 39.4356, + "step": 4817 + }, + { + "epoch": 17.393679458239276, + "grad_norm": 239.75225830078125, + "learning_rate": 3.822141560798548e-06, + "loss": 39.1581, + "step": 4818 + }, + { + "epoch": 17.397291196388263, + "grad_norm": 203.42205810546875, + "learning_rate": 3.816696914700545e-06, + "loss": 39.9827, + "step": 4819 + }, + { + "epoch": 17.400902934537246, + "grad_norm": 217.77159118652344, + "learning_rate": 3.811252268602541e-06, + "loss": 37.5404, + "step": 4820 + }, + { + "epoch": 17.400902934537246, + "eval_loss": 0.6033807396888733, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4820 + }, + { + "epoch": 17.40451467268623, + "grad_norm": 257.9713134765625, + "learning_rate": 3.8058076225045377e-06, + "loss": 35.6571, + "step": 4821 + }, + { + "epoch": 17.408126410835216, + "grad_norm": 295.11468505859375, + "learning_rate": 3.8003629764065335e-06, + "loss": 34.7256, + "step": 4822 + }, + { + "epoch": 17.4117381489842, + "grad_norm": 248.15908813476562, + "learning_rate": 3.7949183303085297e-06, + "loss": 37.3417, + "step": 4823 + }, + { + "epoch": 17.415349887133182, + "grad_norm": 295.19085693359375, + "learning_rate": 3.7894736842105264e-06, + "loss": 37.0117, + "step": 4824 + }, + { + "epoch": 17.41896162528217, + "grad_norm": 249.31576538085938, + "learning_rate": 3.7840290381125226e-06, + "loss": 37.168, + "step": 4825 + }, + { + "epoch": 17.42257336343115, + "grad_norm": 271.1731262207031, + "learning_rate": 3.7785843920145193e-06, + "loss": 35.9932, + "step": 4826 + }, + { + "epoch": 17.426185101580135, + "grad_norm": 380.6817626953125, + "learning_rate": 3.7731397459165155e-06, + "loss": 36.952, + "step": 4827 + }, + { + "epoch": 17.42979683972912, + "grad_norm": 370.125244140625, + "learning_rate": 3.767695099818512e-06, + "loss": 38.2224, + "step": 4828 + }, + { + "epoch": 17.433408577878104, + "grad_norm": 291.13568115234375, + "learning_rate": 3.7622504537205084e-06, + "loss": 38.5377, + "step": 4829 + }, + { + "epoch": 17.437020316027088, + "grad_norm": 329.5670471191406, + "learning_rate": 3.756805807622504e-06, + "loss": 38.1665, + "step": 4830 + }, + { + "epoch": 17.437020316027088, + "eval_loss": 0.6047329902648926, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4830 + }, + { + "epoch": 17.44063205417607, + "grad_norm": 266.0620422363281, + "learning_rate": 3.7513611615245012e-06, + "loss": 34.8371, + "step": 4831 + }, + { + "epoch": 17.444243792325057, + "grad_norm": 257.48980712890625, + "learning_rate": 3.7459165154264975e-06, + "loss": 37.1885, + "step": 4832 + }, + { + "epoch": 17.44785553047404, + "grad_norm": 346.8575439453125, + "learning_rate": 3.740471869328494e-06, + "loss": 38.1426, + "step": 4833 + }, + { + "epoch": 17.451467268623023, + "grad_norm": 246.66868591308594, + "learning_rate": 3.73502722323049e-06, + "loss": 37.6658, + "step": 4834 + }, + { + "epoch": 17.45507900677201, + "grad_norm": 309.71087646484375, + "learning_rate": 3.729582577132486e-06, + "loss": 38.2335, + "step": 4835 + }, + { + "epoch": 17.458690744920993, + "grad_norm": 304.1862487792969, + "learning_rate": 3.724137931034483e-06, + "loss": 38.5964, + "step": 4836 + }, + { + "epoch": 17.462302483069976, + "grad_norm": 253.73211669921875, + "learning_rate": 3.718693284936479e-06, + "loss": 38.9237, + "step": 4837 + }, + { + "epoch": 17.465914221218963, + "grad_norm": 208.52822875976562, + "learning_rate": 3.7132486388384757e-06, + "loss": 35.9177, + "step": 4838 + }, + { + "epoch": 17.469525959367946, + "grad_norm": 258.5502014160156, + "learning_rate": 3.707803992740472e-06, + "loss": 33.2577, + "step": 4839 + }, + { + "epoch": 17.47313769751693, + "grad_norm": 269.1754150390625, + "learning_rate": 3.7023593466424686e-06, + "loss": 31.2634, + "step": 4840 + }, + { + "epoch": 17.47313769751693, + "eval_loss": 0.6035012006759644, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4840 + }, + { + "epoch": 17.476749435665916, + "grad_norm": 268.5780029296875, + "learning_rate": 3.6969147005444644e-06, + "loss": 30.6732, + "step": 4841 + }, + { + "epoch": 17.4803611738149, + "grad_norm": 223.7191619873047, + "learning_rate": 3.691470054446461e-06, + "loss": 31.5905, + "step": 4842 + }, + { + "epoch": 17.483972911963882, + "grad_norm": 266.960205078125, + "learning_rate": 3.6860254083484573e-06, + "loss": 31.9407, + "step": 4843 + }, + { + "epoch": 17.48758465011287, + "grad_norm": 241.2608184814453, + "learning_rate": 3.680580762250454e-06, + "loss": 31.8078, + "step": 4844 + }, + { + "epoch": 17.49119638826185, + "grad_norm": 315.95166015625, + "learning_rate": 3.67513611615245e-06, + "loss": 33.5336, + "step": 4845 + }, + { + "epoch": 17.494808126410835, + "grad_norm": 277.731689453125, + "learning_rate": 3.669691470054447e-06, + "loss": 33.0484, + "step": 4846 + }, + { + "epoch": 17.498419864559818, + "grad_norm": 272.35137939453125, + "learning_rate": 3.664246823956443e-06, + "loss": 33.5048, + "step": 4847 + }, + { + "epoch": 17.502031602708804, + "grad_norm": 260.4573974609375, + "learning_rate": 3.6588021778584393e-06, + "loss": 33.5782, + "step": 4848 + }, + { + "epoch": 17.505643340857787, + "grad_norm": 285.7935485839844, + "learning_rate": 3.6533575317604355e-06, + "loss": 35.0308, + "step": 4849 + }, + { + "epoch": 17.50925507900677, + "grad_norm": 267.613037109375, + "learning_rate": 3.6479128856624317e-06, + "loss": 34.8067, + "step": 4850 + }, + { + "epoch": 17.50925507900677, + "eval_loss": 0.6035751700401306, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4850 + }, + { + "epoch": 17.512866817155757, + "grad_norm": 301.43536376953125, + "learning_rate": 3.6424682395644284e-06, + "loss": 33.1631, + "step": 4851 + }, + { + "epoch": 17.51647855530474, + "grad_norm": 270.10467529296875, + "learning_rate": 3.6370235934664246e-06, + "loss": 32.978, + "step": 4852 + }, + { + "epoch": 17.520090293453723, + "grad_norm": 280.802001953125, + "learning_rate": 3.6315789473684213e-06, + "loss": 35.3346, + "step": 4853 + }, + { + "epoch": 17.52370203160271, + "grad_norm": 314.7720031738281, + "learning_rate": 3.6261343012704175e-06, + "loss": 33.4881, + "step": 4854 + }, + { + "epoch": 17.527313769751693, + "grad_norm": 347.4674072265625, + "learning_rate": 3.620689655172414e-06, + "loss": 31.5599, + "step": 4855 + }, + { + "epoch": 17.530925507900676, + "grad_norm": 207.3061981201172, + "learning_rate": 3.61524500907441e-06, + "loss": 22.159, + "step": 4856 + }, + { + "epoch": 17.534537246049663, + "grad_norm": 216.7202911376953, + "learning_rate": 3.6098003629764066e-06, + "loss": 21.6584, + "step": 4857 + }, + { + "epoch": 17.538148984198646, + "grad_norm": 260.20452880859375, + "learning_rate": 3.604355716878403e-06, + "loss": 22.9289, + "step": 4858 + }, + { + "epoch": 17.54176072234763, + "grad_norm": 295.9897766113281, + "learning_rate": 3.5989110707803995e-06, + "loss": 23.7172, + "step": 4859 + }, + { + "epoch": 17.545372460496615, + "grad_norm": 226.99484252929688, + "learning_rate": 3.5934664246823957e-06, + "loss": 37.5844, + "step": 4860 + }, + { + "epoch": 17.545372460496615, + "eval_loss": 0.6059216260910034, + "eval_runtime": 3.1302, + "eval_samples_per_second": 57.185, + "eval_steps_per_second": 57.185, + "step": 4860 + }, + { + "epoch": 17.5489841986456, + "grad_norm": 231.67477416992188, + "learning_rate": 3.588021778584392e-06, + "loss": 39.5191, + "step": 4861 + }, + { + "epoch": 17.55259593679458, + "grad_norm": 248.46058654785156, + "learning_rate": 3.5825771324863886e-06, + "loss": 39.4246, + "step": 4862 + }, + { + "epoch": 17.55620767494357, + "grad_norm": 239.17247009277344, + "learning_rate": 3.577132486388385e-06, + "loss": 38.9811, + "step": 4863 + }, + { + "epoch": 17.55981941309255, + "grad_norm": 325.3457946777344, + "learning_rate": 3.571687840290381e-06, + "loss": 38.4724, + "step": 4864 + }, + { + "epoch": 17.563431151241534, + "grad_norm": 264.5011901855469, + "learning_rate": 3.5662431941923773e-06, + "loss": 38.79, + "step": 4865 + }, + { + "epoch": 17.567042889390518, + "grad_norm": 251.97154235839844, + "learning_rate": 3.560798548094374e-06, + "loss": 38.0342, + "step": 4866 + }, + { + "epoch": 17.570654627539504, + "grad_norm": 236.78271484375, + "learning_rate": 3.55535390199637e-06, + "loss": 39.8586, + "step": 4867 + }, + { + "epoch": 17.574266365688487, + "grad_norm": 276.8800048828125, + "learning_rate": 3.549909255898367e-06, + "loss": 37.8967, + "step": 4868 + }, + { + "epoch": 17.57787810383747, + "grad_norm": 255.9346160888672, + "learning_rate": 3.544464609800363e-06, + "loss": 39.9833, + "step": 4869 + }, + { + "epoch": 17.581489841986457, + "grad_norm": 273.71337890625, + "learning_rate": 3.5390199637023597e-06, + "loss": 38.6235, + "step": 4870 + }, + { + "epoch": 17.581489841986457, + "eval_loss": 0.6033145189285278, + "eval_runtime": 3.1252, + "eval_samples_per_second": 57.275, + "eval_steps_per_second": 57.275, + "step": 4870 + }, + { + "epoch": 17.58510158013544, + "grad_norm": 252.93063354492188, + "learning_rate": 3.533575317604356e-06, + "loss": 37.9017, + "step": 4871 + }, + { + "epoch": 17.588713318284423, + "grad_norm": 259.8314208984375, + "learning_rate": 3.528130671506352e-06, + "loss": 34.6046, + "step": 4872 + }, + { + "epoch": 17.59232505643341, + "grad_norm": 230.2709197998047, + "learning_rate": 3.5226860254083484e-06, + "loss": 35.301, + "step": 4873 + }, + { + "epoch": 17.595936794582393, + "grad_norm": 306.6289367675781, + "learning_rate": 3.517241379310345e-06, + "loss": 37.4443, + "step": 4874 + }, + { + "epoch": 17.599548532731376, + "grad_norm": 241.5065460205078, + "learning_rate": 3.5117967332123413e-06, + "loss": 36.3646, + "step": 4875 + }, + { + "epoch": 17.603160270880363, + "grad_norm": 234.2492218017578, + "learning_rate": 3.5063520871143375e-06, + "loss": 36.2621, + "step": 4876 + }, + { + "epoch": 17.606772009029346, + "grad_norm": 256.5443115234375, + "learning_rate": 3.500907441016334e-06, + "loss": 36.2202, + "step": 4877 + }, + { + "epoch": 17.61038374717833, + "grad_norm": 280.31097412109375, + "learning_rate": 3.4954627949183304e-06, + "loss": 37.5031, + "step": 4878 + }, + { + "epoch": 17.613995485327315, + "grad_norm": 304.2773132324219, + "learning_rate": 3.4900181488203267e-06, + "loss": 37.1418, + "step": 4879 + }, + { + "epoch": 17.6176072234763, + "grad_norm": 361.27716064453125, + "learning_rate": 3.484573502722323e-06, + "loss": 37.1474, + "step": 4880 + }, + { + "epoch": 17.6176072234763, + "eval_loss": 0.6052342653274536, + "eval_runtime": 3.1249, + "eval_samples_per_second": 57.282, + "eval_steps_per_second": 57.282, + "step": 4880 + }, + { + "epoch": 17.62121896162528, + "grad_norm": 237.64540100097656, + "learning_rate": 3.4791288566243195e-06, + "loss": 38.0673, + "step": 4881 + }, + { + "epoch": 17.624830699774268, + "grad_norm": 351.27215576171875, + "learning_rate": 3.4736842105263158e-06, + "loss": 38.8272, + "step": 4882 + }, + { + "epoch": 17.62844243792325, + "grad_norm": 277.1895751953125, + "learning_rate": 3.4682395644283124e-06, + "loss": 39.1524, + "step": 4883 + }, + { + "epoch": 17.632054176072234, + "grad_norm": 275.1535949707031, + "learning_rate": 3.4627949183303086e-06, + "loss": 37.9027, + "step": 4884 + }, + { + "epoch": 17.635665914221217, + "grad_norm": 335.01776123046875, + "learning_rate": 3.4573502722323053e-06, + "loss": 36.7233, + "step": 4885 + }, + { + "epoch": 17.639277652370204, + "grad_norm": 297.1637878417969, + "learning_rate": 3.4519056261343015e-06, + "loss": 37.782, + "step": 4886 + }, + { + "epoch": 17.642889390519187, + "grad_norm": 265.400390625, + "learning_rate": 3.4464609800362978e-06, + "loss": 37.6639, + "step": 4887 + }, + { + "epoch": 17.64650112866817, + "grad_norm": 345.3449401855469, + "learning_rate": 3.441016333938294e-06, + "loss": 36.7617, + "step": 4888 + }, + { + "epoch": 17.650112866817157, + "grad_norm": 256.0724182128906, + "learning_rate": 3.4355716878402902e-06, + "loss": 32.9906, + "step": 4889 + }, + { + "epoch": 17.65372460496614, + "grad_norm": 260.698486328125, + "learning_rate": 3.430127041742287e-06, + "loss": 32.0811, + "step": 4890 + }, + { + "epoch": 17.65372460496614, + "eval_loss": 0.603126585483551, + "eval_runtime": 3.1268, + "eval_samples_per_second": 57.247, + "eval_steps_per_second": 57.247, + "step": 4890 + }, + { + "epoch": 17.657336343115123, + "grad_norm": 274.9847717285156, + "learning_rate": 3.424682395644283e-06, + "loss": 31.2138, + "step": 4891 + }, + { + "epoch": 17.66094808126411, + "grad_norm": 345.5099182128906, + "learning_rate": 3.4192377495462798e-06, + "loss": 30.302, + "step": 4892 + }, + { + "epoch": 17.664559819413093, + "grad_norm": 269.1453857421875, + "learning_rate": 3.413793103448276e-06, + "loss": 30.2679, + "step": 4893 + }, + { + "epoch": 17.668171557562076, + "grad_norm": 293.7955017089844, + "learning_rate": 3.4083484573502722e-06, + "loss": 31.7616, + "step": 4894 + }, + { + "epoch": 17.671783295711062, + "grad_norm": 306.1725769042969, + "learning_rate": 3.4029038112522685e-06, + "loss": 33.1265, + "step": 4895 + }, + { + "epoch": 17.675395033860045, + "grad_norm": 329.8185119628906, + "learning_rate": 3.397459165154265e-06, + "loss": 33.2131, + "step": 4896 + }, + { + "epoch": 17.67900677200903, + "grad_norm": 340.790283203125, + "learning_rate": 3.3920145190562613e-06, + "loss": 33.243, + "step": 4897 + }, + { + "epoch": 17.682618510158015, + "grad_norm": 324.004150390625, + "learning_rate": 3.386569872958258e-06, + "loss": 33.6235, + "step": 4898 + }, + { + "epoch": 17.686230248306998, + "grad_norm": 263.9126892089844, + "learning_rate": 3.3811252268602542e-06, + "loss": 33.2524, + "step": 4899 + }, + { + "epoch": 17.68984198645598, + "grad_norm": 274.6680603027344, + "learning_rate": 3.375680580762251e-06, + "loss": 34.6629, + "step": 4900 + }, + { + "epoch": 17.68984198645598, + "eval_loss": 0.6027778387069702, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4900 + }, + { + "epoch": 17.693453724604964, + "grad_norm": 317.1280822753906, + "learning_rate": 3.370235934664247e-06, + "loss": 33.3088, + "step": 4901 + }, + { + "epoch": 17.69706546275395, + "grad_norm": 304.1892395019531, + "learning_rate": 3.364791288566243e-06, + "loss": 34.5045, + "step": 4902 + }, + { + "epoch": 17.700677200902934, + "grad_norm": 278.75933837890625, + "learning_rate": 3.3593466424682396e-06, + "loss": 35.8429, + "step": 4903 + }, + { + "epoch": 17.704288939051917, + "grad_norm": 299.76971435546875, + "learning_rate": 3.353901996370236e-06, + "loss": 36.2401, + "step": 4904 + }, + { + "epoch": 17.707900677200904, + "grad_norm": 253.46795654296875, + "learning_rate": 3.3484573502722324e-06, + "loss": 28.938, + "step": 4905 + }, + { + "epoch": 17.711512415349887, + "grad_norm": 220.74098205566406, + "learning_rate": 3.3430127041742287e-06, + "loss": 21.6689, + "step": 4906 + }, + { + "epoch": 17.71512415349887, + "grad_norm": 255.79150390625, + "learning_rate": 3.3375680580762253e-06, + "loss": 21.3497, + "step": 4907 + }, + { + "epoch": 17.718735891647857, + "grad_norm": 284.2683410644531, + "learning_rate": 3.3321234119782216e-06, + "loss": 22.9276, + "step": 4908 + }, + { + "epoch": 17.72234762979684, + "grad_norm": 296.7882080078125, + "learning_rate": 3.3266787658802182e-06, + "loss": 24.7304, + "step": 4909 + }, + { + "epoch": 17.725959367945823, + "grad_norm": 217.35546875, + "learning_rate": 3.321234119782214e-06, + "loss": 38.7687, + "step": 4910 + }, + { + "epoch": 17.725959367945823, + "eval_loss": 0.6015192866325378, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4910 + }, + { + "epoch": 17.72957110609481, + "grad_norm": 256.7005920410156, + "learning_rate": 3.3157894736842107e-06, + "loss": 39.7421, + "step": 4911 + }, + { + "epoch": 17.733182844243792, + "grad_norm": 203.49417114257812, + "learning_rate": 3.310344827586207e-06, + "loss": 39.2911, + "step": 4912 + }, + { + "epoch": 17.736794582392776, + "grad_norm": 282.81439208984375, + "learning_rate": 3.3049001814882036e-06, + "loss": 39.2524, + "step": 4913 + }, + { + "epoch": 17.740406320541762, + "grad_norm": 315.3716735839844, + "learning_rate": 3.2994555353902e-06, + "loss": 37.2097, + "step": 4914 + }, + { + "epoch": 17.744018058690745, + "grad_norm": 250.96484375, + "learning_rate": 3.294010889292196e-06, + "loss": 37.6568, + "step": 4915 + }, + { + "epoch": 17.74762979683973, + "grad_norm": 299.4822082519531, + "learning_rate": 3.2885662431941927e-06, + "loss": 38.9578, + "step": 4916 + }, + { + "epoch": 17.751241534988715, + "grad_norm": 261.2537536621094, + "learning_rate": 3.2831215970961885e-06, + "loss": 40.3838, + "step": 4917 + }, + { + "epoch": 17.754853273137698, + "grad_norm": 220.55218505859375, + "learning_rate": 3.277676950998185e-06, + "loss": 39.2068, + "step": 4918 + }, + { + "epoch": 17.75846501128668, + "grad_norm": 238.06874084472656, + "learning_rate": 3.2722323049001814e-06, + "loss": 40.5383, + "step": 4919 + }, + { + "epoch": 17.762076749435664, + "grad_norm": 223.9597625732422, + "learning_rate": 3.266787658802178e-06, + "loss": 37.3857, + "step": 4920 + }, + { + "epoch": 17.762076749435664, + "eval_loss": 0.602606475353241, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4920 + }, + { + "epoch": 17.76568848758465, + "grad_norm": 278.9289245605469, + "learning_rate": 3.2613430127041742e-06, + "loss": 37.187, + "step": 4921 + }, + { + "epoch": 17.769300225733634, + "grad_norm": 306.52398681640625, + "learning_rate": 3.255898366606171e-06, + "loss": 37.5243, + "step": 4922 + }, + { + "epoch": 17.772911963882617, + "grad_norm": 231.3939208984375, + "learning_rate": 3.250453720508167e-06, + "loss": 35.3104, + "step": 4923 + }, + { + "epoch": 17.776523702031604, + "grad_norm": 216.77613830566406, + "learning_rate": 3.2450090744101638e-06, + "loss": 36.0904, + "step": 4924 + }, + { + "epoch": 17.780135440180587, + "grad_norm": 256.0504150390625, + "learning_rate": 3.2395644283121596e-06, + "loss": 36.4117, + "step": 4925 + }, + { + "epoch": 17.78374717832957, + "grad_norm": 253.29734802246094, + "learning_rate": 3.2341197822141562e-06, + "loss": 37.197, + "step": 4926 + }, + { + "epoch": 17.787358916478556, + "grad_norm": 268.80780029296875, + "learning_rate": 3.2286751361161525e-06, + "loss": 36.4606, + "step": 4927 + }, + { + "epoch": 17.79097065462754, + "grad_norm": 302.3041076660156, + "learning_rate": 3.2232304900181487e-06, + "loss": 36.8647, + "step": 4928 + }, + { + "epoch": 17.794582392776523, + "grad_norm": 274.23797607421875, + "learning_rate": 3.2177858439201454e-06, + "loss": 37.3981, + "step": 4929 + }, + { + "epoch": 17.79819413092551, + "grad_norm": 281.4304504394531, + "learning_rate": 3.2123411978221416e-06, + "loss": 37.2304, + "step": 4930 + }, + { + "epoch": 17.79819413092551, + "eval_loss": 0.6050394773483276, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 4930 + }, + { + "epoch": 17.801805869074492, + "grad_norm": 277.47698974609375, + "learning_rate": 3.2068965517241382e-06, + "loss": 35.9281, + "step": 4931 + }, + { + "epoch": 17.805417607223475, + "grad_norm": 394.02294921875, + "learning_rate": 3.201451905626134e-06, + "loss": 39.0143, + "step": 4932 + }, + { + "epoch": 17.809029345372462, + "grad_norm": 252.8087158203125, + "learning_rate": 3.1960072595281307e-06, + "loss": 36.9452, + "step": 4933 + }, + { + "epoch": 17.812641083521445, + "grad_norm": 249.54962158203125, + "learning_rate": 3.190562613430127e-06, + "loss": 39.2442, + "step": 4934 + }, + { + "epoch": 17.816252821670428, + "grad_norm": 286.9231262207031, + "learning_rate": 3.1851179673321236e-06, + "loss": 38.6445, + "step": 4935 + }, + { + "epoch": 17.819864559819415, + "grad_norm": 345.7146911621094, + "learning_rate": 3.17967332123412e-06, + "loss": 37.1794, + "step": 4936 + }, + { + "epoch": 17.823476297968398, + "grad_norm": 271.23089599609375, + "learning_rate": 3.1742286751361165e-06, + "loss": 36.3952, + "step": 4937 + }, + { + "epoch": 17.82708803611738, + "grad_norm": 406.3717346191406, + "learning_rate": 3.1687840290381127e-06, + "loss": 33.8166, + "step": 4938 + }, + { + "epoch": 17.830699774266364, + "grad_norm": 300.12554931640625, + "learning_rate": 3.1633393829401094e-06, + "loss": 30.9614, + "step": 4939 + }, + { + "epoch": 17.83431151241535, + "grad_norm": 229.67218017578125, + "learning_rate": 3.157894736842105e-06, + "loss": 31.8592, + "step": 4940 + }, + { + "epoch": 17.83431151241535, + "eval_loss": 0.6021057367324829, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4940 + }, + { + "epoch": 17.837923250564334, + "grad_norm": 269.0873107910156, + "learning_rate": 3.1524500907441014e-06, + "loss": 31.7702, + "step": 4941 + }, + { + "epoch": 17.841534988713317, + "grad_norm": 279.0237731933594, + "learning_rate": 3.147005444646098e-06, + "loss": 31.3615, + "step": 4942 + }, + { + "epoch": 17.845146726862303, + "grad_norm": 234.94839477539062, + "learning_rate": 3.1415607985480943e-06, + "loss": 31.9314, + "step": 4943 + }, + { + "epoch": 17.848758465011286, + "grad_norm": 239.25613403320312, + "learning_rate": 3.136116152450091e-06, + "loss": 32.4513, + "step": 4944 + }, + { + "epoch": 17.85237020316027, + "grad_norm": 257.09661865234375, + "learning_rate": 3.130671506352087e-06, + "loss": 34.4964, + "step": 4945 + }, + { + "epoch": 17.855981941309256, + "grad_norm": 328.88006591796875, + "learning_rate": 3.125226860254084e-06, + "loss": 33.1662, + "step": 4946 + }, + { + "epoch": 17.85959367945824, + "grad_norm": 291.4894714355469, + "learning_rate": 3.1197822141560796e-06, + "loss": 34.4406, + "step": 4947 + }, + { + "epoch": 17.863205417607222, + "grad_norm": 282.81158447265625, + "learning_rate": 3.1143375680580763e-06, + "loss": 32.7141, + "step": 4948 + }, + { + "epoch": 17.86681715575621, + "grad_norm": 300.0378112792969, + "learning_rate": 3.1088929219600725e-06, + "loss": 34.3423, + "step": 4949 + }, + { + "epoch": 17.870428893905192, + "grad_norm": 267.2983703613281, + "learning_rate": 3.103448275862069e-06, + "loss": 33.1653, + "step": 4950 + }, + { + "epoch": 17.870428893905192, + "eval_loss": 0.6020416021347046, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4950 + }, + { + "epoch": 17.874040632054175, + "grad_norm": 270.53277587890625, + "learning_rate": 3.0980036297640654e-06, + "loss": 34.7582, + "step": 4951 + }, + { + "epoch": 17.877652370203162, + "grad_norm": 346.0074157714844, + "learning_rate": 3.092558983666062e-06, + "loss": 35.9911, + "step": 4952 + }, + { + "epoch": 17.881264108352145, + "grad_norm": 367.5807189941406, + "learning_rate": 3.0871143375680583e-06, + "loss": 35.3345, + "step": 4953 + }, + { + "epoch": 17.884875846501128, + "grad_norm": 304.21649169921875, + "learning_rate": 3.0816696914700545e-06, + "loss": 32.9797, + "step": 4954 + }, + { + "epoch": 17.888487584650115, + "grad_norm": 253.14601135253906, + "learning_rate": 3.0762250453720507e-06, + "loss": 22.6226, + "step": 4955 + }, + { + "epoch": 17.892099322799098, + "grad_norm": 270.3512268066406, + "learning_rate": 3.070780399274047e-06, + "loss": 21.9531, + "step": 4956 + }, + { + "epoch": 17.89571106094808, + "grad_norm": 192.73712158203125, + "learning_rate": 3.0653357531760436e-06, + "loss": 21.8497, + "step": 4957 + }, + { + "epoch": 17.899322799097064, + "grad_norm": 254.43759155273438, + "learning_rate": 3.05989110707804e-06, + "loss": 23.2694, + "step": 4958 + }, + { + "epoch": 17.90293453724605, + "grad_norm": 271.2293395996094, + "learning_rate": 3.0544464609800365e-06, + "loss": 22.9774, + "step": 4959 + }, + { + "epoch": 17.906546275395034, + "grad_norm": 213.7334747314453, + "learning_rate": 3.0490018148820327e-06, + "loss": 38.8821, + "step": 4960 + }, + { + "epoch": 17.906546275395034, + "eval_loss": 0.600848913192749, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 4960 + }, + { + "epoch": 17.910158013544017, + "grad_norm": 269.9356384277344, + "learning_rate": 3.0435571687840294e-06, + "loss": 38.6362, + "step": 4961 + }, + { + "epoch": 17.913769751693003, + "grad_norm": 237.6484832763672, + "learning_rate": 3.0381125226860256e-06, + "loss": 39.6388, + "step": 4962 + }, + { + "epoch": 17.917381489841986, + "grad_norm": 304.2347106933594, + "learning_rate": 3.032667876588022e-06, + "loss": 39.4308, + "step": 4963 + }, + { + "epoch": 17.92099322799097, + "grad_norm": 250.6772918701172, + "learning_rate": 3.027223230490018e-06, + "loss": 40.1923, + "step": 4964 + }, + { + "epoch": 17.924604966139956, + "grad_norm": 261.7320556640625, + "learning_rate": 3.0217785843920147e-06, + "loss": 37.862, + "step": 4965 + }, + { + "epoch": 17.92821670428894, + "grad_norm": 385.33197021484375, + "learning_rate": 3.016333938294011e-06, + "loss": 35.9139, + "step": 4966 + }, + { + "epoch": 17.931828442437922, + "grad_norm": 436.6773986816406, + "learning_rate": 3.010889292196007e-06, + "loss": 36.6259, + "step": 4967 + }, + { + "epoch": 17.93544018058691, + "grad_norm": 318.65673828125, + "learning_rate": 3.005444646098004e-06, + "loss": 36.1235, + "step": 4968 + }, + { + "epoch": 17.939051918735892, + "grad_norm": 241.6234893798828, + "learning_rate": 3e-06, + "loss": 37.4148, + "step": 4969 + }, + { + "epoch": 17.942663656884875, + "grad_norm": 316.8415832519531, + "learning_rate": 2.9945553539019963e-06, + "loss": 36.7089, + "step": 4970 + }, + { + "epoch": 17.942663656884875, + "eval_loss": 0.6032605171203613, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.061, + "eval_steps_per_second": 57.061, + "step": 4970 + }, + { + "epoch": 17.94627539503386, + "grad_norm": 322.0501403808594, + "learning_rate": 2.9891107078039925e-06, + "loss": 37.2222, + "step": 4971 + }, + { + "epoch": 17.949887133182845, + "grad_norm": 300.4189453125, + "learning_rate": 2.983666061705989e-06, + "loss": 37.9156, + "step": 4972 + }, + { + "epoch": 17.953498871331828, + "grad_norm": 304.39263916015625, + "learning_rate": 2.9782214156079854e-06, + "loss": 38.5253, + "step": 4973 + }, + { + "epoch": 17.957110609480814, + "grad_norm": 297.4574890136719, + "learning_rate": 2.972776769509982e-06, + "loss": 38.4385, + "step": 4974 + }, + { + "epoch": 17.960722347629797, + "grad_norm": 367.7257080078125, + "learning_rate": 2.9673321234119783e-06, + "loss": 36.2943, + "step": 4975 + }, + { + "epoch": 17.96433408577878, + "grad_norm": 274.61724853515625, + "learning_rate": 2.961887477313975e-06, + "loss": 30.8753, + "step": 4976 + }, + { + "epoch": 17.967945823927764, + "grad_norm": 358.50201416015625, + "learning_rate": 2.956442831215971e-06, + "loss": 32.1308, + "step": 4977 + }, + { + "epoch": 17.97155756207675, + "grad_norm": 493.7792663574219, + "learning_rate": 2.9509981851179674e-06, + "loss": 33.2474, + "step": 4978 + }, + { + "epoch": 17.975169300225733, + "grad_norm": 426.67138671875, + "learning_rate": 2.9455535390199636e-06, + "loss": 33.7065, + "step": 4979 + }, + { + "epoch": 17.978781038374716, + "grad_norm": 524.0231323242188, + "learning_rate": 2.94010889292196e-06, + "loss": 34.6007, + "step": 4980 + }, + { + "epoch": 17.978781038374716, + "eval_loss": 0.6021283268928528, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 4980 + }, + { + "epoch": 17.982392776523703, + "grad_norm": 395.26715087890625, + "learning_rate": 2.9346642468239565e-06, + "loss": 33.9185, + "step": 4981 + }, + { + "epoch": 17.986004514672686, + "grad_norm": 400.0454406738281, + "learning_rate": 2.9292196007259528e-06, + "loss": 34.6485, + "step": 4982 + }, + { + "epoch": 17.98961625282167, + "grad_norm": 376.1269226074219, + "learning_rate": 2.9237749546279494e-06, + "loss": 34.668, + "step": 4983 + }, + { + "epoch": 17.993227990970656, + "grad_norm": 315.5225524902344, + "learning_rate": 2.9183303085299456e-06, + "loss": 30.7058, + "step": 4984 + }, + { + "epoch": 17.99683972911964, + "grad_norm": 221.5032958984375, + "learning_rate": 2.912885662431942e-06, + "loss": 21.8055, + "step": 4985 + }, + { + "epoch": 18.0, + "grad_norm": 226.06068420410156, + "learning_rate": 2.907441016333938e-06, + "loss": 20.5066, + "step": 4986 + }, + { + "epoch": 18.003611738148983, + "grad_norm": 209.69607543945312, + "learning_rate": 2.9019963702359348e-06, + "loss": 37.9156, + "step": 4987 + }, + { + "epoch": 18.00722347629797, + "grad_norm": 218.86709594726562, + "learning_rate": 2.896551724137931e-06, + "loss": 38.8204, + "step": 4988 + }, + { + "epoch": 18.010835214446953, + "grad_norm": 218.38180541992188, + "learning_rate": 2.8911070780399276e-06, + "loss": 38.5472, + "step": 4989 + }, + { + "epoch": 18.014446952595936, + "grad_norm": 338.4778747558594, + "learning_rate": 2.885662431941924e-06, + "loss": 37.7233, + "step": 4990 + }, + { + "epoch": 18.014446952595936, + "eval_loss": 0.6013379096984863, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4990 + }, + { + "epoch": 18.018058690744923, + "grad_norm": 309.5385437011719, + "learning_rate": 2.8802177858439205e-06, + "loss": 38.3321, + "step": 4991 + }, + { + "epoch": 18.021670428893906, + "grad_norm": 335.67169189453125, + "learning_rate": 2.8747731397459168e-06, + "loss": 38.2367, + "step": 4992 + }, + { + "epoch": 18.02528216704289, + "grad_norm": 260.5025939941406, + "learning_rate": 2.8693284936479126e-06, + "loss": 38.5516, + "step": 4993 + }, + { + "epoch": 18.028893905191875, + "grad_norm": 265.4793395996094, + "learning_rate": 2.8638838475499092e-06, + "loss": 38.9539, + "step": 4994 + }, + { + "epoch": 18.03250564334086, + "grad_norm": 237.87942504882812, + "learning_rate": 2.8584392014519054e-06, + "loss": 39.4582, + "step": 4995 + }, + { + "epoch": 18.03611738148984, + "grad_norm": 252.11746215820312, + "learning_rate": 2.852994555353902e-06, + "loss": 39.3466, + "step": 4996 + }, + { + "epoch": 18.039729119638825, + "grad_norm": 298.1370849609375, + "learning_rate": 2.8475499092558983e-06, + "loss": 36.9779, + "step": 4997 + }, + { + "epoch": 18.04334085778781, + "grad_norm": 341.9007873535156, + "learning_rate": 2.842105263157895e-06, + "loss": 36.5117, + "step": 4998 + }, + { + "epoch": 18.046952595936794, + "grad_norm": 210.0319366455078, + "learning_rate": 2.8366606170598912e-06, + "loss": 34.7543, + "step": 4999 + }, + { + "epoch": 18.050564334085777, + "grad_norm": 385.6400146484375, + "learning_rate": 2.831215970961888e-06, + "loss": 36.4577, + "step": 5000 + }, + { + "epoch": 18.050564334085777, + "eval_loss": 0.6031082272529602, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 5000 + }, + { + "epoch": 18.054176072234764, + "grad_norm": 268.4949035644531, + "learning_rate": 2.8257713248638837e-06, + "loss": 36.3765, + "step": 5001 + }, + { + "epoch": 18.057787810383747, + "grad_norm": 311.2984313964844, + "learning_rate": 2.8203266787658803e-06, + "loss": 35.709, + "step": 5002 + }, + { + "epoch": 18.06139954853273, + "grad_norm": 264.0671081542969, + "learning_rate": 2.8148820326678766e-06, + "loss": 35.7978, + "step": 5003 + }, + { + "epoch": 18.065011286681717, + "grad_norm": 341.0770263671875, + "learning_rate": 2.8094373865698732e-06, + "loss": 36.8963, + "step": 5004 + }, + { + "epoch": 18.0686230248307, + "grad_norm": 253.3942108154297, + "learning_rate": 2.8039927404718694e-06, + "loss": 37.1135, + "step": 5005 + }, + { + "epoch": 18.072234762979683, + "grad_norm": 286.23736572265625, + "learning_rate": 2.7985480943738657e-06, + "loss": 35.736, + "step": 5006 + }, + { + "epoch": 18.07584650112867, + "grad_norm": 327.71295166015625, + "learning_rate": 2.7931034482758623e-06, + "loss": 36.4917, + "step": 5007 + }, + { + "epoch": 18.079458239277653, + "grad_norm": 351.00616455078125, + "learning_rate": 2.787658802177858e-06, + "loss": 37.2807, + "step": 5008 + }, + { + "epoch": 18.083069977426636, + "grad_norm": 291.02923583984375, + "learning_rate": 2.782214156079855e-06, + "loss": 38.0345, + "step": 5009 + }, + { + "epoch": 18.086681715575622, + "grad_norm": 288.7776184082031, + "learning_rate": 2.776769509981851e-06, + "loss": 37.112, + "step": 5010 + }, + { + "epoch": 18.086681715575622, + "eval_loss": 0.6058472990989685, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 5010 + }, + { + "epoch": 18.090293453724605, + "grad_norm": 437.8114929199219, + "learning_rate": 2.7713248638838477e-06, + "loss": 37.9063, + "step": 5011 + }, + { + "epoch": 18.09390519187359, + "grad_norm": 324.5924072265625, + "learning_rate": 2.765880217785844e-06, + "loss": 37.8524, + "step": 5012 + }, + { + "epoch": 18.097516930022575, + "grad_norm": 358.40625, + "learning_rate": 2.7604355716878406e-06, + "loss": 37.5547, + "step": 5013 + }, + { + "epoch": 18.101128668171558, + "grad_norm": 290.75604248046875, + "learning_rate": 2.7549909255898368e-06, + "loss": 36.4437, + "step": 5014 + }, + { + "epoch": 18.10474040632054, + "grad_norm": 284.41424560546875, + "learning_rate": 2.7495462794918334e-06, + "loss": 34.3336, + "step": 5015 + }, + { + "epoch": 18.108352144469524, + "grad_norm": 254.59889221191406, + "learning_rate": 2.7441016333938292e-06, + "loss": 32.4527, + "step": 5016 + }, + { + "epoch": 18.11196388261851, + "grad_norm": 266.0207214355469, + "learning_rate": 2.738656987295826e-06, + "loss": 30.4014, + "step": 5017 + }, + { + "epoch": 18.115575620767494, + "grad_norm": 219.9434356689453, + "learning_rate": 2.733212341197822e-06, + "loss": 30.2838, + "step": 5018 + }, + { + "epoch": 18.119187358916477, + "grad_norm": 312.7678527832031, + "learning_rate": 2.7277676950998188e-06, + "loss": 31.6877, + "step": 5019 + }, + { + "epoch": 18.122799097065464, + "grad_norm": 282.99774169921875, + "learning_rate": 2.722323049001815e-06, + "loss": 33.3686, + "step": 5020 + }, + { + "epoch": 18.122799097065464, + "eval_loss": 0.6027761697769165, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 5020 + }, + { + "epoch": 18.126410835214447, + "grad_norm": 371.9994201660156, + "learning_rate": 2.7168784029038112e-06, + "loss": 32.5397, + "step": 5021 + }, + { + "epoch": 18.13002257336343, + "grad_norm": 241.19049072265625, + "learning_rate": 2.711433756805808e-06, + "loss": 33.4329, + "step": 5022 + }, + { + "epoch": 18.133634311512417, + "grad_norm": 310.2216796875, + "learning_rate": 2.7059891107078037e-06, + "loss": 31.888, + "step": 5023 + }, + { + "epoch": 18.1372460496614, + "grad_norm": 277.1349182128906, + "learning_rate": 2.7005444646098004e-06, + "loss": 33.9345, + "step": 5024 + }, + { + "epoch": 18.140857787810383, + "grad_norm": 419.3515930175781, + "learning_rate": 2.6950998185117966e-06, + "loss": 33.5826, + "step": 5025 + }, + { + "epoch": 18.14446952595937, + "grad_norm": 289.1166687011719, + "learning_rate": 2.6896551724137932e-06, + "loss": 34.324, + "step": 5026 + }, + { + "epoch": 18.148081264108352, + "grad_norm": 364.20233154296875, + "learning_rate": 2.6842105263157895e-06, + "loss": 34.45, + "step": 5027 + }, + { + "epoch": 18.151693002257336, + "grad_norm": 341.71551513671875, + "learning_rate": 2.678765880217786e-06, + "loss": 33.9126, + "step": 5028 + }, + { + "epoch": 18.155304740406322, + "grad_norm": 283.1939697265625, + "learning_rate": 2.6733212341197824e-06, + "loss": 33.7188, + "step": 5029 + }, + { + "epoch": 18.158916478555305, + "grad_norm": 369.6583251953125, + "learning_rate": 2.667876588021779e-06, + "loss": 35.0354, + "step": 5030 + }, + { + "epoch": 18.158916478555305, + "eval_loss": 0.6033984422683716, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 5030 + }, + { + "epoch": 18.16252821670429, + "grad_norm": 323.95806884765625, + "learning_rate": 2.662431941923775e-06, + "loss": 34.6853, + "step": 5031 + }, + { + "epoch": 18.16613995485327, + "grad_norm": 274.2629089355469, + "learning_rate": 2.6569872958257715e-06, + "loss": 32.1261, + "step": 5032 + }, + { + "epoch": 18.169751693002258, + "grad_norm": 229.66163635253906, + "learning_rate": 2.6515426497277677e-06, + "loss": 22.0549, + "step": 5033 + }, + { + "epoch": 18.17336343115124, + "grad_norm": 212.78070068359375, + "learning_rate": 2.646098003629764e-06, + "loss": 21.4483, + "step": 5034 + }, + { + "epoch": 18.176975169300224, + "grad_norm": 184.7995147705078, + "learning_rate": 2.6406533575317606e-06, + "loss": 22.5133, + "step": 5035 + }, + { + "epoch": 18.18058690744921, + "grad_norm": 256.6748046875, + "learning_rate": 2.635208711433757e-06, + "loss": 23.6443, + "step": 5036 + }, + { + "epoch": 18.184198645598194, + "grad_norm": 230.683349609375, + "learning_rate": 2.6297640653357535e-06, + "loss": 38.3633, + "step": 5037 + }, + { + "epoch": 18.187810383747177, + "grad_norm": 251.70166015625, + "learning_rate": 2.6243194192377497e-06, + "loss": 40.1229, + "step": 5038 + }, + { + "epoch": 18.191422121896164, + "grad_norm": 219.9066162109375, + "learning_rate": 2.618874773139746e-06, + "loss": 38.6539, + "step": 5039 + }, + { + "epoch": 18.195033860045147, + "grad_norm": 290.7185974121094, + "learning_rate": 2.613430127041742e-06, + "loss": 38.0385, + "step": 5040 + }, + { + "epoch": 18.195033860045147, + "eval_loss": 0.6022469401359558, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.993, + "eval_steps_per_second": 56.993, + "step": 5040 + }, + { + "epoch": 18.19864559819413, + "grad_norm": 334.9693908691406, + "learning_rate": 2.607985480943739e-06, + "loss": 38.2381, + "step": 5041 + }, + { + "epoch": 18.202257336343116, + "grad_norm": 283.9659423828125, + "learning_rate": 2.602540834845735e-06, + "loss": 39.2603, + "step": 5042 + }, + { + "epoch": 18.2058690744921, + "grad_norm": 291.4002990722656, + "learning_rate": 2.5970961887477317e-06, + "loss": 39.633, + "step": 5043 + }, + { + "epoch": 18.209480812641083, + "grad_norm": 249.14329528808594, + "learning_rate": 2.591651542649728e-06, + "loss": 39.1938, + "step": 5044 + }, + { + "epoch": 18.21309255079007, + "grad_norm": 226.1659393310547, + "learning_rate": 2.5862068965517246e-06, + "loss": 39.8308, + "step": 5045 + }, + { + "epoch": 18.216704288939052, + "grad_norm": 270.2198181152344, + "learning_rate": 2.5807622504537204e-06, + "loss": 38.4712, + "step": 5046 + }, + { + "epoch": 18.220316027088035, + "grad_norm": 263.83819580078125, + "learning_rate": 2.5753176043557166e-06, + "loss": 37.3572, + "step": 5047 + }, + { + "epoch": 18.223927765237022, + "grad_norm": 316.8177795410156, + "learning_rate": 2.5698729582577133e-06, + "loss": 36.3821, + "step": 5048 + }, + { + "epoch": 18.227539503386005, + "grad_norm": 318.7213134765625, + "learning_rate": 2.5644283121597095e-06, + "loss": 34.8209, + "step": 5049 + }, + { + "epoch": 18.231151241534988, + "grad_norm": 267.6168518066406, + "learning_rate": 2.558983666061706e-06, + "loss": 35.6173, + "step": 5050 + }, + { + "epoch": 18.231151241534988, + "eval_loss": 0.6044466495513916, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 5050 + }, + { + "epoch": 18.23476297968397, + "grad_norm": 277.739501953125, + "learning_rate": 2.5535390199637024e-06, + "loss": 35.2828, + "step": 5051 + }, + { + "epoch": 18.238374717832958, + "grad_norm": 288.2068786621094, + "learning_rate": 2.548094373865699e-06, + "loss": 36.7972, + "step": 5052 + }, + { + "epoch": 18.24198645598194, + "grad_norm": 217.59716796875, + "learning_rate": 2.5426497277676953e-06, + "loss": 36.3637, + "step": 5053 + }, + { + "epoch": 18.245598194130924, + "grad_norm": 411.8970031738281, + "learning_rate": 2.5372050816696915e-06, + "loss": 37.3086, + "step": 5054 + }, + { + "epoch": 18.24920993227991, + "grad_norm": 351.9718933105469, + "learning_rate": 2.5317604355716877e-06, + "loss": 37.0896, + "step": 5055 + }, + { + "epoch": 18.252821670428894, + "grad_norm": 343.1683044433594, + "learning_rate": 2.5263157894736844e-06, + "loss": 37.2533, + "step": 5056 + }, + { + "epoch": 18.256433408577877, + "grad_norm": 413.0977783203125, + "learning_rate": 2.5208711433756806e-06, + "loss": 36.9987, + "step": 5057 + }, + { + "epoch": 18.260045146726863, + "grad_norm": 331.73223876953125, + "learning_rate": 2.5154264972776773e-06, + "loss": 36.8624, + "step": 5058 + }, + { + "epoch": 18.263656884875846, + "grad_norm": 434.96990966796875, + "learning_rate": 2.5099818511796735e-06, + "loss": 37.949, + "step": 5059 + }, + { + "epoch": 18.26726862302483, + "grad_norm": 324.4934997558594, + "learning_rate": 2.5045372050816697e-06, + "loss": 37.6272, + "step": 5060 + }, + { + "epoch": 18.26726862302483, + "eval_loss": 0.6042292714118958, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 5060 + }, + { + "epoch": 18.270880361173816, + "grad_norm": 312.1228942871094, + "learning_rate": 2.499092558983666e-06, + "loss": 38.6362, + "step": 5061 + }, + { + "epoch": 18.2744920993228, + "grad_norm": 427.6184997558594, + "learning_rate": 2.493647912885662e-06, + "loss": 39.2934, + "step": 5062 + }, + { + "epoch": 18.278103837471782, + "grad_norm": 344.6819763183594, + "learning_rate": 2.488203266787659e-06, + "loss": 38.0684, + "step": 5063 + }, + { + "epoch": 18.28171557562077, + "grad_norm": 317.42303466796875, + "learning_rate": 2.482758620689655e-06, + "loss": 38.2323, + "step": 5064 + }, + { + "epoch": 18.285327313769752, + "grad_norm": 338.830810546875, + "learning_rate": 2.4773139745916517e-06, + "loss": 34.2699, + "step": 5065 + }, + { + "epoch": 18.288939051918735, + "grad_norm": 286.7263488769531, + "learning_rate": 2.471869328493648e-06, + "loss": 32.5149, + "step": 5066 + }, + { + "epoch": 18.292550790067722, + "grad_norm": 278.9923095703125, + "learning_rate": 2.4664246823956446e-06, + "loss": 31.033, + "step": 5067 + }, + { + "epoch": 18.296162528216705, + "grad_norm": 264.0198669433594, + "learning_rate": 2.460980036297641e-06, + "loss": 29.5549, + "step": 5068 + }, + { + "epoch": 18.299774266365688, + "grad_norm": 241.6163330078125, + "learning_rate": 2.455535390199637e-06, + "loss": 30.2173, + "step": 5069 + }, + { + "epoch": 18.30338600451467, + "grad_norm": 278.5418395996094, + "learning_rate": 2.4500907441016333e-06, + "loss": 30.8286, + "step": 5070 + }, + { + "epoch": 18.30338600451467, + "eval_loss": 0.6035094261169434, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 5070 + }, + { + "epoch": 18.306997742663658, + "grad_norm": 277.5758056640625, + "learning_rate": 2.44464609800363e-06, + "loss": 33.6778, + "step": 5071 + }, + { + "epoch": 18.31060948081264, + "grad_norm": 295.81201171875, + "learning_rate": 2.439201451905626e-06, + "loss": 33.5914, + "step": 5072 + }, + { + "epoch": 18.314221218961624, + "grad_norm": 293.4093017578125, + "learning_rate": 2.4337568058076224e-06, + "loss": 33.6203, + "step": 5073 + }, + { + "epoch": 18.31783295711061, + "grad_norm": 277.2228698730469, + "learning_rate": 2.428312159709619e-06, + "loss": 33.6465, + "step": 5074 + }, + { + "epoch": 18.321444695259594, + "grad_norm": 286.3224792480469, + "learning_rate": 2.4228675136116153e-06, + "loss": 32.6013, + "step": 5075 + }, + { + "epoch": 18.325056433408577, + "grad_norm": 320.6168212890625, + "learning_rate": 2.417422867513612e-06, + "loss": 32.6469, + "step": 5076 + }, + { + "epoch": 18.328668171557563, + "grad_norm": 327.364990234375, + "learning_rate": 2.4119782214156078e-06, + "loss": 34.354, + "step": 5077 + }, + { + "epoch": 18.332279909706546, + "grad_norm": 342.06634521484375, + "learning_rate": 2.4065335753176044e-06, + "loss": 34.3143, + "step": 5078 + }, + { + "epoch": 18.33589164785553, + "grad_norm": 370.70343017578125, + "learning_rate": 2.4010889292196006e-06, + "loss": 33.7771, + "step": 5079 + }, + { + "epoch": 18.339503386004516, + "grad_norm": 358.7357177734375, + "learning_rate": 2.3956442831215973e-06, + "loss": 35.5377, + "step": 5080 + }, + { + "epoch": 18.339503386004516, + "eval_loss": 0.6033809185028076, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.931, + "eval_steps_per_second": 56.931, + "step": 5080 + }, + { + "epoch": 18.3431151241535, + "grad_norm": 463.8668518066406, + "learning_rate": 2.3901996370235935e-06, + "loss": 35.4711, + "step": 5081 + }, + { + "epoch": 18.346726862302482, + "grad_norm": 256.5113220214844, + "learning_rate": 2.38475499092559e-06, + "loss": 26.8532, + "step": 5082 + }, + { + "epoch": 18.35033860045147, + "grad_norm": 228.83883666992188, + "learning_rate": 2.3793103448275864e-06, + "loss": 21.6636, + "step": 5083 + }, + { + "epoch": 18.353950338600452, + "grad_norm": 238.70742797851562, + "learning_rate": 2.3738656987295826e-06, + "loss": 22.2091, + "step": 5084 + }, + { + "epoch": 18.357562076749435, + "grad_norm": 276.8741760253906, + "learning_rate": 2.368421052631579e-06, + "loss": 22.1242, + "step": 5085 + }, + { + "epoch": 18.36117381489842, + "grad_norm": 226.4810333251953, + "learning_rate": 2.362976406533575e-06, + "loss": 23.359, + "step": 5086 + }, + { + "epoch": 18.364785553047405, + "grad_norm": 212.53111267089844, + "learning_rate": 2.3575317604355718e-06, + "loss": 37.7694, + "step": 5087 + }, + { + "epoch": 18.368397291196388, + "grad_norm": 227.26710510253906, + "learning_rate": 2.352087114337568e-06, + "loss": 39.8064, + "step": 5088 + }, + { + "epoch": 18.37200902934537, + "grad_norm": 201.0309295654297, + "learning_rate": 2.3466424682395646e-06, + "loss": 38.9716, + "step": 5089 + }, + { + "epoch": 18.375620767494357, + "grad_norm": 311.7691345214844, + "learning_rate": 2.341197822141561e-06, + "loss": 39.8326, + "step": 5090 + }, + { + "epoch": 18.375620767494357, + "eval_loss": 0.6036086082458496, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 5090 + }, + { + "epoch": 18.37923250564334, + "grad_norm": 251.5362091064453, + "learning_rate": 2.3357531760435575e-06, + "loss": 38.2591, + "step": 5091 + }, + { + "epoch": 18.382844243792324, + "grad_norm": 241.64373779296875, + "learning_rate": 2.3303085299455533e-06, + "loss": 38.0327, + "step": 5092 + }, + { + "epoch": 18.38645598194131, + "grad_norm": 231.7598114013672, + "learning_rate": 2.32486388384755e-06, + "loss": 38.6853, + "step": 5093 + }, + { + "epoch": 18.390067720090293, + "grad_norm": 287.66644287109375, + "learning_rate": 2.3194192377495462e-06, + "loss": 39.6929, + "step": 5094 + }, + { + "epoch": 18.393679458239276, + "grad_norm": 289.3146057128906, + "learning_rate": 2.313974591651543e-06, + "loss": 38.3129, + "step": 5095 + }, + { + "epoch": 18.397291196388263, + "grad_norm": 291.4801330566406, + "learning_rate": 2.308529945553539e-06, + "loss": 38.2505, + "step": 5096 + }, + { + "epoch": 18.400902934537246, + "grad_norm": 337.4052429199219, + "learning_rate": 2.3030852994555358e-06, + "loss": 37.7476, + "step": 5097 + }, + { + "epoch": 18.40451467268623, + "grad_norm": 460.0773010253906, + "learning_rate": 2.297640653357532e-06, + "loss": 36.1112, + "step": 5098 + }, + { + "epoch": 18.408126410835216, + "grad_norm": 322.4940185546875, + "learning_rate": 2.292196007259528e-06, + "loss": 36.5374, + "step": 5099 + }, + { + "epoch": 18.4117381489842, + "grad_norm": 350.4710388183594, + "learning_rate": 2.2867513611615244e-06, + "loss": 37.5286, + "step": 5100 + }, + { + "epoch": 18.4117381489842, + "eval_loss": 0.6045494079589844, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.101, + "eval_steps_per_second": 57.101, + "step": 5100 + }, + { + "epoch": 18.415349887133182, + "grad_norm": 306.18634033203125, + "learning_rate": 2.2813067150635207e-06, + "loss": 37.3676, + "step": 5101 + }, + { + "epoch": 18.41896162528217, + "grad_norm": 289.237060546875, + "learning_rate": 2.2758620689655173e-06, + "loss": 36.6916, + "step": 5102 + }, + { + "epoch": 18.42257336343115, + "grad_norm": 266.69207763671875, + "learning_rate": 2.2704174228675136e-06, + "loss": 36.2887, + "step": 5103 + }, + { + "epoch": 18.426185101580135, + "grad_norm": 264.54119873046875, + "learning_rate": 2.2649727767695102e-06, + "loss": 37.1267, + "step": 5104 + }, + { + "epoch": 18.42979683972912, + "grad_norm": 262.6132507324219, + "learning_rate": 2.2595281306715064e-06, + "loss": 36.6862, + "step": 5105 + }, + { + "epoch": 18.433408577878104, + "grad_norm": 231.68226623535156, + "learning_rate": 2.254083484573503e-06, + "loss": 35.7714, + "step": 5106 + }, + { + "epoch": 18.437020316027088, + "grad_norm": 299.72613525390625, + "learning_rate": 2.248638838475499e-06, + "loss": 37.648, + "step": 5107 + }, + { + "epoch": 18.44063205417607, + "grad_norm": 424.94708251953125, + "learning_rate": 2.2431941923774956e-06, + "loss": 35.9776, + "step": 5108 + }, + { + "epoch": 18.444243792325057, + "grad_norm": 449.78570556640625, + "learning_rate": 2.2377495462794918e-06, + "loss": 38.0571, + "step": 5109 + }, + { + "epoch": 18.44785553047404, + "grad_norm": 284.00634765625, + "learning_rate": 2.2323049001814884e-06, + "loss": 37.758, + "step": 5110 + }, + { + "epoch": 18.44785553047404, + "eval_loss": 0.6064541935920715, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.048, + "eval_steps_per_second": 57.048, + "step": 5110 + }, + { + "epoch": 18.451467268623023, + "grad_norm": 359.1011962890625, + "learning_rate": 2.2268602540834847e-06, + "loss": 38.8924, + "step": 5111 + }, + { + "epoch": 18.45507900677201, + "grad_norm": 307.7583923339844, + "learning_rate": 2.221415607985481e-06, + "loss": 38.2116, + "step": 5112 + }, + { + "epoch": 18.458690744920993, + "grad_norm": 359.5586242675781, + "learning_rate": 2.2159709618874776e-06, + "loss": 39.6894, + "step": 5113 + }, + { + "epoch": 18.462302483069976, + "grad_norm": 258.3985595703125, + "learning_rate": 2.2105263157894734e-06, + "loss": 36.4586, + "step": 5114 + }, + { + "epoch": 18.465914221218963, + "grad_norm": 363.09600830078125, + "learning_rate": 2.20508166969147e-06, + "loss": 34.489, + "step": 5115 + }, + { + "epoch": 18.469525959367946, + "grad_norm": 237.136474609375, + "learning_rate": 2.1996370235934662e-06, + "loss": 32.5826, + "step": 5116 + }, + { + "epoch": 18.47313769751693, + "grad_norm": 400.25604248046875, + "learning_rate": 2.194192377495463e-06, + "loss": 31.3005, + "step": 5117 + }, + { + "epoch": 18.476749435665916, + "grad_norm": 467.9855651855469, + "learning_rate": 2.188747731397459e-06, + "loss": 30.2261, + "step": 5118 + }, + { + "epoch": 18.4803611738149, + "grad_norm": 384.4250183105469, + "learning_rate": 2.1833030852994558e-06, + "loss": 33.5844, + "step": 5119 + }, + { + "epoch": 18.483972911963882, + "grad_norm": 324.4369201660156, + "learning_rate": 2.177858439201452e-06, + "loss": 32.5136, + "step": 5120 + }, + { + "epoch": 18.483972911963882, + "eval_loss": 0.602573573589325, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 5120 + }, + { + "epoch": 18.48758465011287, + "grad_norm": 372.0033264160156, + "learning_rate": 2.1724137931034487e-06, + "loss": 31.4322, + "step": 5121 + }, + { + "epoch": 18.49119638826185, + "grad_norm": 336.265869140625, + "learning_rate": 2.1669691470054445e-06, + "loss": 34.163, + "step": 5122 + }, + { + "epoch": 18.494808126410835, + "grad_norm": 339.8494873046875, + "learning_rate": 2.161524500907441e-06, + "loss": 31.2627, + "step": 5123 + }, + { + "epoch": 18.498419864559818, + "grad_norm": 279.3925476074219, + "learning_rate": 2.1560798548094374e-06, + "loss": 32.3994, + "step": 5124 + }, + { + "epoch": 18.502031602708804, + "grad_norm": 281.546875, + "learning_rate": 2.1506352087114336e-06, + "loss": 34.8467, + "step": 5125 + }, + { + "epoch": 18.505643340857787, + "grad_norm": 315.8692626953125, + "learning_rate": 2.1451905626134302e-06, + "loss": 33.632, + "step": 5126 + }, + { + "epoch": 18.50925507900677, + "grad_norm": 289.3066711425781, + "learning_rate": 2.1397459165154265e-06, + "loss": 34.312, + "step": 5127 + }, + { + "epoch": 18.512866817155757, + "grad_norm": 274.190673828125, + "learning_rate": 2.134301270417423e-06, + "loss": 32.9937, + "step": 5128 + }, + { + "epoch": 18.51647855530474, + "grad_norm": 317.9950256347656, + "learning_rate": 2.1288566243194194e-06, + "loss": 35.8788, + "step": 5129 + }, + { + "epoch": 18.520090293453723, + "grad_norm": 342.9775695800781, + "learning_rate": 2.1234119782214156e-06, + "loss": 35.2397, + "step": 5130 + }, + { + "epoch": 18.520090293453723, + "eval_loss": 0.6024553179740906, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 5130 + }, + { + "epoch": 18.52370203160271, + "grad_norm": 351.09637451171875, + "learning_rate": 2.117967332123412e-06, + "loss": 33.1556, + "step": 5131 + }, + { + "epoch": 18.527313769751693, + "grad_norm": 229.55613708496094, + "learning_rate": 2.1125226860254085e-06, + "loss": 26.6317, + "step": 5132 + }, + { + "epoch": 18.530925507900676, + "grad_norm": 234.53562927246094, + "learning_rate": 2.1070780399274047e-06, + "loss": 21.316, + "step": 5133 + }, + { + "epoch": 18.534537246049663, + "grad_norm": 241.59982299804688, + "learning_rate": 2.1016333938294014e-06, + "loss": 21.2739, + "step": 5134 + }, + { + "epoch": 18.538148984198646, + "grad_norm": 207.2808380126953, + "learning_rate": 2.0961887477313976e-06, + "loss": 22.736, + "step": 5135 + }, + { + "epoch": 18.54176072234763, + "grad_norm": 236.13955688476562, + "learning_rate": 2.0907441016333942e-06, + "loss": 22.7503, + "step": 5136 + }, + { + "epoch": 18.545372460496615, + "grad_norm": 181.6793670654297, + "learning_rate": 2.08529945553539e-06, + "loss": 37.9001, + "step": 5137 + }, + { + "epoch": 18.5489841986456, + "grad_norm": 249.5441131591797, + "learning_rate": 2.0798548094373863e-06, + "loss": 39.52, + "step": 5138 + }, + { + "epoch": 18.55259593679458, + "grad_norm": 215.67855834960938, + "learning_rate": 2.074410163339383e-06, + "loss": 38.6667, + "step": 5139 + }, + { + "epoch": 18.55620767494357, + "grad_norm": 280.9402770996094, + "learning_rate": 2.068965517241379e-06, + "loss": 36.9602, + "step": 5140 + }, + { + "epoch": 18.55620767494357, + "eval_loss": 0.6027256846427917, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 5140 + }, + { + "epoch": 18.55981941309255, + "grad_norm": 265.9155578613281, + "learning_rate": 2.063520871143376e-06, + "loss": 38.8654, + "step": 5141 + }, + { + "epoch": 18.563431151241534, + "grad_norm": 300.0267028808594, + "learning_rate": 2.058076225045372e-06, + "loss": 38.8917, + "step": 5142 + }, + { + "epoch": 18.567042889390518, + "grad_norm": 243.0481414794922, + "learning_rate": 2.0526315789473687e-06, + "loss": 39.2785, + "step": 5143 + }, + { + "epoch": 18.570654627539504, + "grad_norm": 270.58380126953125, + "learning_rate": 2.047186932849365e-06, + "loss": 39.3892, + "step": 5144 + }, + { + "epoch": 18.574266365688487, + "grad_norm": 311.60430908203125, + "learning_rate": 2.041742286751361e-06, + "loss": 39.5933, + "step": 5145 + }, + { + "epoch": 18.57787810383747, + "grad_norm": 285.160400390625, + "learning_rate": 2.0362976406533574e-06, + "loss": 38.2962, + "step": 5146 + }, + { + "epoch": 18.581489841986457, + "grad_norm": 232.0592041015625, + "learning_rate": 2.030852994555354e-06, + "loss": 38.5965, + "step": 5147 + }, + { + "epoch": 18.58510158013544, + "grad_norm": 221.85525512695312, + "learning_rate": 2.0254083484573503e-06, + "loss": 36.516, + "step": 5148 + }, + { + "epoch": 18.588713318284423, + "grad_norm": 291.9794921875, + "learning_rate": 2.019963702359347e-06, + "loss": 36.3976, + "step": 5149 + }, + { + "epoch": 18.59232505643341, + "grad_norm": 387.8580322265625, + "learning_rate": 2.014519056261343e-06, + "loss": 35.2321, + "step": 5150 + }, + { + "epoch": 18.59232505643341, + "eval_loss": 0.6030355095863342, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.046, + "eval_steps_per_second": 57.046, + "step": 5150 + }, + { + "epoch": 18.595936794582393, + "grad_norm": 300.14508056640625, + "learning_rate": 2.0090744101633394e-06, + "loss": 36.4186, + "step": 5151 + }, + { + "epoch": 18.599548532731376, + "grad_norm": 294.1235656738281, + "learning_rate": 2.0036297640653356e-06, + "loss": 36.014, + "step": 5152 + }, + { + "epoch": 18.603160270880363, + "grad_norm": 389.1570129394531, + "learning_rate": 1.998185117967332e-06, + "loss": 36.1648, + "step": 5153 + }, + { + "epoch": 18.606772009029346, + "grad_norm": 244.6651153564453, + "learning_rate": 1.9927404718693285e-06, + "loss": 36.1033, + "step": 5154 + }, + { + "epoch": 18.61038374717833, + "grad_norm": 302.52996826171875, + "learning_rate": 1.9872958257713247e-06, + "loss": 37.1531, + "step": 5155 + }, + { + "epoch": 18.613995485327315, + "grad_norm": 352.86273193359375, + "learning_rate": 1.9818511796733214e-06, + "loss": 37.8204, + "step": 5156 + }, + { + "epoch": 18.6176072234763, + "grad_norm": 308.61431884765625, + "learning_rate": 1.9764065335753176e-06, + "loss": 37.2097, + "step": 5157 + }, + { + "epoch": 18.62121896162528, + "grad_norm": 288.30712890625, + "learning_rate": 1.9709618874773143e-06, + "loss": 36.4242, + "step": 5158 + }, + { + "epoch": 18.624830699774268, + "grad_norm": 315.9750671386719, + "learning_rate": 1.9655172413793105e-06, + "loss": 35.9204, + "step": 5159 + }, + { + "epoch": 18.62844243792325, + "grad_norm": 468.51055908203125, + "learning_rate": 1.9600725952813067e-06, + "loss": 38.9178, + "step": 5160 + }, + { + "epoch": 18.62844243792325, + "eval_loss": 0.6054540872573853, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 5160 + }, + { + "epoch": 18.632054176072234, + "grad_norm": 310.5861511230469, + "learning_rate": 1.954627949183303e-06, + "loss": 37.9588, + "step": 5161 + }, + { + "epoch": 18.635665914221217, + "grad_norm": 424.3090515136719, + "learning_rate": 1.9491833030852996e-06, + "loss": 38.1028, + "step": 5162 + }, + { + "epoch": 18.639277652370204, + "grad_norm": 330.6189880371094, + "learning_rate": 1.943738656987296e-06, + "loss": 36.5096, + "step": 5163 + }, + { + "epoch": 18.642889390519187, + "grad_norm": 305.9330139160156, + "learning_rate": 1.9382940108892925e-06, + "loss": 36.871, + "step": 5164 + }, + { + "epoch": 18.64650112866817, + "grad_norm": 410.06793212890625, + "learning_rate": 1.9328493647912887e-06, + "loss": 37.4061, + "step": 5165 + }, + { + "epoch": 18.650112866817157, + "grad_norm": 385.49127197265625, + "learning_rate": 1.927404718693285e-06, + "loss": 33.6399, + "step": 5166 + }, + { + "epoch": 18.65372460496614, + "grad_norm": 270.96783447265625, + "learning_rate": 1.9219600725952816e-06, + "loss": 31.3483, + "step": 5167 + }, + { + "epoch": 18.657336343115123, + "grad_norm": 329.84405517578125, + "learning_rate": 1.9165154264972774e-06, + "loss": 30.2639, + "step": 5168 + }, + { + "epoch": 18.66094808126411, + "grad_norm": 413.7260437011719, + "learning_rate": 1.911070780399274e-06, + "loss": 31.2749, + "step": 5169 + }, + { + "epoch": 18.664559819413093, + "grad_norm": 276.43585205078125, + "learning_rate": 1.9056261343012705e-06, + "loss": 30.3596, + "step": 5170 + }, + { + "epoch": 18.664559819413093, + "eval_loss": 0.6022100448608398, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.117, + "eval_steps_per_second": 57.117, + "step": 5170 + }, + { + "epoch": 18.668171557562076, + "grad_norm": 248.9257049560547, + "learning_rate": 1.9001814882032667e-06, + "loss": 32.4066, + "step": 5171 + }, + { + "epoch": 18.671783295711062, + "grad_norm": 252.70388793945312, + "learning_rate": 1.8947368421052632e-06, + "loss": 32.3724, + "step": 5172 + }, + { + "epoch": 18.675395033860045, + "grad_norm": 325.0677795410156, + "learning_rate": 1.8892921960072596e-06, + "loss": 32.3041, + "step": 5173 + }, + { + "epoch": 18.67900677200903, + "grad_norm": 420.9740295410156, + "learning_rate": 1.883847549909256e-06, + "loss": 32.6609, + "step": 5174 + }, + { + "epoch": 18.682618510158015, + "grad_norm": 239.59371948242188, + "learning_rate": 1.878402903811252e-06, + "loss": 32.8471, + "step": 5175 + }, + { + "epoch": 18.686230248306998, + "grad_norm": 301.13165283203125, + "learning_rate": 1.8729582577132487e-06, + "loss": 32.2686, + "step": 5176 + }, + { + "epoch": 18.68984198645598, + "grad_norm": 282.7923889160156, + "learning_rate": 1.867513611615245e-06, + "loss": 34.2726, + "step": 5177 + }, + { + "epoch": 18.693453724604964, + "grad_norm": 434.20550537109375, + "learning_rate": 1.8620689655172414e-06, + "loss": 35.335, + "step": 5178 + }, + { + "epoch": 18.69706546275395, + "grad_norm": 306.680908203125, + "learning_rate": 1.8566243194192379e-06, + "loss": 33.3156, + "step": 5179 + }, + { + "epoch": 18.700677200902934, + "grad_norm": 253.27711486816406, + "learning_rate": 1.8511796733212343e-06, + "loss": 34.9504, + "step": 5180 + }, + { + "epoch": 18.700677200902934, + "eval_loss": 0.6021104454994202, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 5180 + }, + { + "epoch": 18.704288939051917, + "grad_norm": 391.74945068359375, + "learning_rate": 1.8457350272232305e-06, + "loss": 35.285, + "step": 5181 + }, + { + "epoch": 18.707900677200904, + "grad_norm": 265.4142150878906, + "learning_rate": 1.840290381125227e-06, + "loss": 27.689, + "step": 5182 + }, + { + "epoch": 18.711512415349887, + "grad_norm": 217.80746459960938, + "learning_rate": 1.8348457350272234e-06, + "loss": 22.6159, + "step": 5183 + }, + { + "epoch": 18.71512415349887, + "grad_norm": 220.21180725097656, + "learning_rate": 1.8294010889292196e-06, + "loss": 22.1321, + "step": 5184 + }, + { + "epoch": 18.718735891647857, + "grad_norm": 239.4197998046875, + "learning_rate": 1.8239564428312159e-06, + "loss": 22.5479, + "step": 5185 + }, + { + "epoch": 18.72234762979684, + "grad_norm": 281.7828674316406, + "learning_rate": 1.8185117967332123e-06, + "loss": 23.5363, + "step": 5186 + }, + { + "epoch": 18.725959367945823, + "grad_norm": 231.81980895996094, + "learning_rate": 1.8130671506352088e-06, + "loss": 39.0953, + "step": 5187 + }, + { + "epoch": 18.72957110609481, + "grad_norm": 242.0535430908203, + "learning_rate": 1.807622504537205e-06, + "loss": 39.4842, + "step": 5188 + }, + { + "epoch": 18.733182844243792, + "grad_norm": 235.6869659423828, + "learning_rate": 1.8021778584392014e-06, + "loss": 37.4884, + "step": 5189 + }, + { + "epoch": 18.736794582392776, + "grad_norm": 291.5176086425781, + "learning_rate": 1.7967332123411979e-06, + "loss": 38.9612, + "step": 5190 + }, + { + "epoch": 18.736794582392776, + "eval_loss": 0.6040608286857605, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 5190 + }, + { + "epoch": 18.740406320541762, + "grad_norm": 407.5574645996094, + "learning_rate": 1.7912885662431943e-06, + "loss": 39.3531, + "step": 5191 + }, + { + "epoch": 18.744018058690745, + "grad_norm": 277.07891845703125, + "learning_rate": 1.7858439201451905e-06, + "loss": 38.4866, + "step": 5192 + }, + { + "epoch": 18.74762979683973, + "grad_norm": 350.2939453125, + "learning_rate": 1.780399274047187e-06, + "loss": 38.0073, + "step": 5193 + }, + { + "epoch": 18.751241534988715, + "grad_norm": 395.7618103027344, + "learning_rate": 1.7749546279491834e-06, + "loss": 38.1693, + "step": 5194 + }, + { + "epoch": 18.754853273137698, + "grad_norm": 296.43267822265625, + "learning_rate": 1.7695099818511799e-06, + "loss": 38.6162, + "step": 5195 + }, + { + "epoch": 18.75846501128668, + "grad_norm": 335.7173156738281, + "learning_rate": 1.764065335753176e-06, + "loss": 38.9182, + "step": 5196 + }, + { + "epoch": 18.762076749435664, + "grad_norm": 273.09368896484375, + "learning_rate": 1.7586206896551725e-06, + "loss": 38.0685, + "step": 5197 + }, + { + "epoch": 18.76568848758465, + "grad_norm": 359.718505859375, + "learning_rate": 1.7531760435571688e-06, + "loss": 36.8994, + "step": 5198 + }, + { + "epoch": 18.769300225733634, + "grad_norm": 345.5837097167969, + "learning_rate": 1.7477313974591652e-06, + "loss": 35.375, + "step": 5199 + }, + { + "epoch": 18.772911963882617, + "grad_norm": 266.8583984375, + "learning_rate": 1.7422867513611614e-06, + "loss": 34.7559, + "step": 5200 + }, + { + "epoch": 18.772911963882617, + "eval_loss": 0.6007165908813477, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 5200 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.550543125818245e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..14cf94b7176d0036ab485ba071356721982f51b4 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41c88d717da6bb0a33c747a6a8c8e5ea055c1d2ee9a2994682c36ff237c57950 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..6af6c4938e70092c4506583ab952a312b7073eda --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:895f37a7506a2b130e8506167e8d180b1ddfc5d2f23fbf3c0ddab89099b26387 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb7f39f0658511956314453ad34f8646dbecd726 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:985881a7539e7de0feed0acde00da3ca0bc2f2e96ea445ef318a595b00eb8684 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..eb4ba7dd1787297cc071571f2d43c3c63a598442 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:208a5a28481eda2056538c7a83920a4ee4e6279f7c04546f994e350df162ed62 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9f2689fea1186ef19781d9bb396c29b2e01ff3c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3a2e993707525a0b2634a7eae322dcfd8401a7e77d05526eb302128ce8d1579 +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ed120dd8cade4686faf06cb7802e3682704dedcc --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/trainer_state.json @@ -0,0 +1,42153 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 19.494808126410835, + "eval_steps": 10, + "global_step": 5400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + }, + { + "epoch": 15.166139954853273, + "grad_norm": 227.31346130371094, + "learning_rate": 7.181488203266788e-06, + "loss": 28.0234, + "step": 4201 + }, + { + "epoch": 15.169751693002258, + "grad_norm": 208.75048828125, + "learning_rate": 7.176043557168784e-06, + "loss": 22.5147, + "step": 4202 + }, + { + "epoch": 15.173363431151241, + "grad_norm": 222.91090393066406, + "learning_rate": 7.17059891107078e-06, + "loss": 22.1029, + "step": 4203 + }, + { + "epoch": 15.176975169300226, + "grad_norm": 219.40621948242188, + "learning_rate": 7.165154264972777e-06, + "loss": 22.9827, + "step": 4204 + }, + { + "epoch": 15.18058690744921, + "grad_norm": 229.11813354492188, + "learning_rate": 7.1597096188747735e-06, + "loss": 23.6974, + "step": 4205 + }, + { + "epoch": 15.184198645598194, + "grad_norm": 256.7950744628906, + "learning_rate": 7.15426497277677e-06, + "loss": 39.6585, + "step": 4206 + }, + { + "epoch": 15.187810383747179, + "grad_norm": 237.47613525390625, + "learning_rate": 7.148820326678766e-06, + "loss": 40.0478, + "step": 4207 + }, + { + "epoch": 15.191422121896162, + "grad_norm": 259.54296875, + "learning_rate": 7.143375680580762e-06, + "loss": 39.7604, + "step": 4208 + }, + { + "epoch": 15.195033860045147, + "grad_norm": 249.7389678955078, + "learning_rate": 7.137931034482759e-06, + "loss": 39.0201, + "step": 4209 + }, + { + "epoch": 15.198645598194132, + "grad_norm": 298.4624938964844, + "learning_rate": 7.132486388384755e-06, + "loss": 39.8575, + "step": 4210 + }, + { + "epoch": 15.198645598194132, + "eval_loss": 0.6088115572929382, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 4210 + }, + { + "epoch": 15.202257336343115, + "grad_norm": 267.57659912109375, + "learning_rate": 7.127041742286752e-06, + "loss": 38.8929, + "step": 4211 + }, + { + "epoch": 15.2058690744921, + "grad_norm": 243.88333129882812, + "learning_rate": 7.121597096188748e-06, + "loss": 39.6078, + "step": 4212 + }, + { + "epoch": 15.209480812641084, + "grad_norm": 268.2644348144531, + "learning_rate": 7.116152450090745e-06, + "loss": 39.9488, + "step": 4213 + }, + { + "epoch": 15.213092550790067, + "grad_norm": 240.2657928466797, + "learning_rate": 7.11070780399274e-06, + "loss": 40.1645, + "step": 4214 + }, + { + "epoch": 15.216704288939052, + "grad_norm": 198.76910400390625, + "learning_rate": 7.105263157894737e-06, + "loss": 38.2229, + "step": 4215 + }, + { + "epoch": 15.220316027088035, + "grad_norm": 234.11170959472656, + "learning_rate": 7.099818511796734e-06, + "loss": 39.5294, + "step": 4216 + }, + { + "epoch": 15.22392776523702, + "grad_norm": 192.80194091796875, + "learning_rate": 7.094373865698729e-06, + "loss": 36.9752, + "step": 4217 + }, + { + "epoch": 15.227539503386005, + "grad_norm": 241.8236846923828, + "learning_rate": 7.088929219600726e-06, + "loss": 36.1043, + "step": 4218 + }, + { + "epoch": 15.231151241534988, + "grad_norm": 451.6199645996094, + "learning_rate": 7.083484573502722e-06, + "loss": 37.7911, + "step": 4219 + }, + { + "epoch": 15.234762979683973, + "grad_norm": 351.9429626464844, + "learning_rate": 7.0780399274047195e-06, + "loss": 35.5202, + "step": 4220 + }, + { + "epoch": 15.234762979683973, + "eval_loss": 0.6093130111694336, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 4220 + }, + { + "epoch": 15.238374717832958, + "grad_norm": 266.4995422363281, + "learning_rate": 7.072595281306715e-06, + "loss": 37.5552, + "step": 4221 + }, + { + "epoch": 15.241986455981941, + "grad_norm": 258.74578857421875, + "learning_rate": 7.067150635208712e-06, + "loss": 37.1315, + "step": 4222 + }, + { + "epoch": 15.245598194130926, + "grad_norm": 233.30921936035156, + "learning_rate": 7.061705989110708e-06, + "loss": 36.9237, + "step": 4223 + }, + { + "epoch": 15.249209932279909, + "grad_norm": 235.8688201904297, + "learning_rate": 7.056261343012704e-06, + "loss": 38.0112, + "step": 4224 + }, + { + "epoch": 15.252821670428894, + "grad_norm": 214.88436889648438, + "learning_rate": 7.050816696914701e-06, + "loss": 38.5641, + "step": 4225 + }, + { + "epoch": 15.256433408577879, + "grad_norm": 252.64144897460938, + "learning_rate": 7.045372050816697e-06, + "loss": 36.7125, + "step": 4226 + }, + { + "epoch": 15.260045146726862, + "grad_norm": 293.78424072265625, + "learning_rate": 7.039927404718694e-06, + "loss": 37.5956, + "step": 4227 + }, + { + "epoch": 15.263656884875846, + "grad_norm": 234.13510131835938, + "learning_rate": 7.03448275862069e-06, + "loss": 38.1829, + "step": 4228 + }, + { + "epoch": 15.267268623024831, + "grad_norm": 279.534912109375, + "learning_rate": 7.029038112522686e-06, + "loss": 39.0785, + "step": 4229 + }, + { + "epoch": 15.270880361173814, + "grad_norm": 246.4442596435547, + "learning_rate": 7.023593466424683e-06, + "loss": 39.1753, + "step": 4230 + }, + { + "epoch": 15.270880361173814, + "eval_loss": 0.6043311357498169, + "eval_runtime": 3.1452, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 4230 + }, + { + "epoch": 15.2744920993228, + "grad_norm": 233.87466430664062, + "learning_rate": 7.018148820326679e-06, + "loss": 39.8464, + "step": 4231 + }, + { + "epoch": 15.278103837471784, + "grad_norm": 228.54898071289062, + "learning_rate": 7.012704174228675e-06, + "loss": 37.9721, + "step": 4232 + }, + { + "epoch": 15.281715575620767, + "grad_norm": 273.70050048828125, + "learning_rate": 7.007259528130671e-06, + "loss": 38.9153, + "step": 4233 + }, + { + "epoch": 15.285327313769752, + "grad_norm": 269.8402404785156, + "learning_rate": 7.001814882032668e-06, + "loss": 36.7607, + "step": 4234 + }, + { + "epoch": 15.288939051918735, + "grad_norm": 260.13629150390625, + "learning_rate": 6.996370235934665e-06, + "loss": 35.3684, + "step": 4235 + }, + { + "epoch": 15.29255079006772, + "grad_norm": 223.9878692626953, + "learning_rate": 6.990925589836661e-06, + "loss": 32.8784, + "step": 4236 + }, + { + "epoch": 15.296162528216705, + "grad_norm": 225.69212341308594, + "learning_rate": 6.985480943738657e-06, + "loss": 31.3751, + "step": 4237 + }, + { + "epoch": 15.299774266365688, + "grad_norm": 215.99801635742188, + "learning_rate": 6.980036297640653e-06, + "loss": 31.5331, + "step": 4238 + }, + { + "epoch": 15.303386004514673, + "grad_norm": 263.26568603515625, + "learning_rate": 6.97459165154265e-06, + "loss": 32.5806, + "step": 4239 + }, + { + "epoch": 15.306997742663658, + "grad_norm": 203.2392578125, + "learning_rate": 6.969147005444646e-06, + "loss": 31.6379, + "step": 4240 + }, + { + "epoch": 15.306997742663658, + "eval_loss": 0.6046441793441772, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 4240 + }, + { + "epoch": 15.31060948081264, + "grad_norm": 221.2167510986328, + "learning_rate": 6.963702359346643e-06, + "loss": 33.7034, + "step": 4241 + }, + { + "epoch": 15.314221218961626, + "grad_norm": 212.58737182617188, + "learning_rate": 6.958257713248639e-06, + "loss": 32.5511, + "step": 4242 + }, + { + "epoch": 15.317832957110609, + "grad_norm": 270.7123718261719, + "learning_rate": 6.952813067150635e-06, + "loss": 33.2513, + "step": 4243 + }, + { + "epoch": 15.321444695259594, + "grad_norm": 270.2066345214844, + "learning_rate": 6.9473684210526315e-06, + "loss": 33.9559, + "step": 4244 + }, + { + "epoch": 15.325056433408578, + "grad_norm": 232.8043212890625, + "learning_rate": 6.941923774954628e-06, + "loss": 33.9916, + "step": 4245 + }, + { + "epoch": 15.328668171557561, + "grad_norm": 325.419921875, + "learning_rate": 6.936479128856625e-06, + "loss": 35.2098, + "step": 4246 + }, + { + "epoch": 15.332279909706546, + "grad_norm": 303.326416015625, + "learning_rate": 6.93103448275862e-06, + "loss": 35.0784, + "step": 4247 + }, + { + "epoch": 15.335891647855531, + "grad_norm": 327.05963134765625, + "learning_rate": 6.925589836660617e-06, + "loss": 35.9915, + "step": 4248 + }, + { + "epoch": 15.339503386004514, + "grad_norm": 326.58795166015625, + "learning_rate": 6.9201451905626135e-06, + "loss": 35.1914, + "step": 4249 + }, + { + "epoch": 15.343115124153499, + "grad_norm": 406.38812255859375, + "learning_rate": 6.914700544464611e-06, + "loss": 37.1535, + "step": 4250 + }, + { + "epoch": 15.343115124153499, + "eval_loss": 0.6056071519851685, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 4250 + }, + { + "epoch": 15.346726862302482, + "grad_norm": 325.6965637207031, + "learning_rate": 6.909255898366606e-06, + "loss": 29.8698, + "step": 4251 + }, + { + "epoch": 15.350338600451467, + "grad_norm": 212.59727478027344, + "learning_rate": 6.903811252268603e-06, + "loss": 22.2995, + "step": 4252 + }, + { + "epoch": 15.353950338600452, + "grad_norm": 257.447509765625, + "learning_rate": 6.898366606170599e-06, + "loss": 23.1014, + "step": 4253 + }, + { + "epoch": 15.357562076749435, + "grad_norm": 266.139892578125, + "learning_rate": 6.8929219600725955e-06, + "loss": 23.2319, + "step": 4254 + }, + { + "epoch": 15.36117381489842, + "grad_norm": 332.7207336425781, + "learning_rate": 6.887477313974592e-06, + "loss": 23.7218, + "step": 4255 + }, + { + "epoch": 15.364785553047405, + "grad_norm": 272.7341003417969, + "learning_rate": 6.882032667876588e-06, + "loss": 39.5787, + "step": 4256 + }, + { + "epoch": 15.368397291196388, + "grad_norm": 259.00872802734375, + "learning_rate": 6.876588021778585e-06, + "loss": 41.0874, + "step": 4257 + }, + { + "epoch": 15.372009029345373, + "grad_norm": 236.87033081054688, + "learning_rate": 6.8711433756805804e-06, + "loss": 38.9811, + "step": 4258 + }, + { + "epoch": 15.375620767494357, + "grad_norm": 293.6808776855469, + "learning_rate": 6.8656987295825775e-06, + "loss": 39.481, + "step": 4259 + }, + { + "epoch": 15.37923250564334, + "grad_norm": 266.0845947265625, + "learning_rate": 6.860254083484574e-06, + "loss": 39.4595, + "step": 4260 + }, + { + "epoch": 15.37923250564334, + "eval_loss": 0.6039742231369019, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 4260 + }, + { + "epoch": 15.382844243792325, + "grad_norm": 398.0877685546875, + "learning_rate": 6.85480943738657e-06, + "loss": 38.8899, + "step": 4261 + }, + { + "epoch": 15.386455981941308, + "grad_norm": 208.37376403808594, + "learning_rate": 6.849364791288566e-06, + "loss": 39.2194, + "step": 4262 + }, + { + "epoch": 15.390067720090293, + "grad_norm": 214.6958770751953, + "learning_rate": 6.8439201451905624e-06, + "loss": 38.9911, + "step": 4263 + }, + { + "epoch": 15.393679458239278, + "grad_norm": 210.2147674560547, + "learning_rate": 6.8384754990925595e-06, + "loss": 40.5973, + "step": 4264 + }, + { + "epoch": 15.397291196388261, + "grad_norm": 240.47030639648438, + "learning_rate": 6.833030852994556e-06, + "loss": 39.3936, + "step": 4265 + }, + { + "epoch": 15.400902934537246, + "grad_norm": 273.86883544921875, + "learning_rate": 6.827586206896552e-06, + "loss": 40.0848, + "step": 4266 + }, + { + "epoch": 15.404514672686231, + "grad_norm": 239.36453247070312, + "learning_rate": 6.822141560798548e-06, + "loss": 36.5967, + "step": 4267 + }, + { + "epoch": 15.408126410835214, + "grad_norm": 215.3413543701172, + "learning_rate": 6.8166969147005444e-06, + "loss": 37.8173, + "step": 4268 + }, + { + "epoch": 15.411738148984199, + "grad_norm": 260.1557312011719, + "learning_rate": 6.811252268602541e-06, + "loss": 37.7175, + "step": 4269 + }, + { + "epoch": 15.415349887133182, + "grad_norm": 239.4988555908203, + "learning_rate": 6.805807622504537e-06, + "loss": 37.0618, + "step": 4270 + }, + { + "epoch": 15.415349887133182, + "eval_loss": 0.6049810647964478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4270 + }, + { + "epoch": 15.418961625282167, + "grad_norm": 223.06094360351562, + "learning_rate": 6.800362976406534e-06, + "loss": 37.0687, + "step": 4271 + }, + { + "epoch": 15.422573363431152, + "grad_norm": 261.7460632324219, + "learning_rate": 6.79491833030853e-06, + "loss": 35.9437, + "step": 4272 + }, + { + "epoch": 15.426185101580135, + "grad_norm": 230.92135620117188, + "learning_rate": 6.7894736842105264e-06, + "loss": 38.3316, + "step": 4273 + }, + { + "epoch": 15.42979683972912, + "grad_norm": 370.6309509277344, + "learning_rate": 6.784029038112523e-06, + "loss": 38.2666, + "step": 4274 + }, + { + "epoch": 15.433408577878104, + "grad_norm": 249.7823944091797, + "learning_rate": 6.77858439201452e-06, + "loss": 38.1159, + "step": 4275 + }, + { + "epoch": 15.437020316027088, + "grad_norm": 404.1676330566406, + "learning_rate": 6.773139745916516e-06, + "loss": 37.6548, + "step": 4276 + }, + { + "epoch": 15.440632054176072, + "grad_norm": 256.3241271972656, + "learning_rate": 6.767695099818511e-06, + "loss": 38.3713, + "step": 4277 + }, + { + "epoch": 15.444243792325057, + "grad_norm": 240.55934143066406, + "learning_rate": 6.7622504537205084e-06, + "loss": 39.2487, + "step": 4278 + }, + { + "epoch": 15.44785553047404, + "grad_norm": 230.010009765625, + "learning_rate": 6.756805807622505e-06, + "loss": 39.4391, + "step": 4279 + }, + { + "epoch": 15.451467268623025, + "grad_norm": 226.51385498046875, + "learning_rate": 6.751361161524502e-06, + "loss": 38.6273, + "step": 4280 + }, + { + "epoch": 15.451467268623025, + "eval_loss": 0.6027400493621826, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 4280 + }, + { + "epoch": 15.455079006772008, + "grad_norm": 314.57476806640625, + "learning_rate": 6.745916515426497e-06, + "loss": 38.583, + "step": 4281 + }, + { + "epoch": 15.458690744920993, + "grad_norm": 229.91238403320312, + "learning_rate": 6.740471869328494e-06, + "loss": 39.2433, + "step": 4282 + }, + { + "epoch": 15.462302483069978, + "grad_norm": 284.7301330566406, + "learning_rate": 6.7350272232304904e-06, + "loss": 38.8577, + "step": 4283 + }, + { + "epoch": 15.465914221218961, + "grad_norm": 209.32266235351562, + "learning_rate": 6.729582577132486e-06, + "loss": 34.928, + "step": 4284 + }, + { + "epoch": 15.469525959367946, + "grad_norm": 264.6195068359375, + "learning_rate": 6.724137931034483e-06, + "loss": 32.0527, + "step": 4285 + }, + { + "epoch": 15.47313769751693, + "grad_norm": 224.2421112060547, + "learning_rate": 6.718693284936479e-06, + "loss": 31.939, + "step": 4286 + }, + { + "epoch": 15.476749435665914, + "grad_norm": 233.0791015625, + "learning_rate": 6.713248638838476e-06, + "loss": 32.5402, + "step": 4287 + }, + { + "epoch": 15.480361173814899, + "grad_norm": 284.129638671875, + "learning_rate": 6.707803992740472e-06, + "loss": 31.0069, + "step": 4288 + }, + { + "epoch": 15.483972911963882, + "grad_norm": 253.6517791748047, + "learning_rate": 6.702359346642469e-06, + "loss": 32.0172, + "step": 4289 + }, + { + "epoch": 15.487584650112867, + "grad_norm": 305.63775634765625, + "learning_rate": 6.696914700544465e-06, + "loss": 34.1643, + "step": 4290 + }, + { + "epoch": 15.487584650112867, + "eval_loss": 0.6044390201568604, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 4290 + }, + { + "epoch": 15.491196388261852, + "grad_norm": 224.6516876220703, + "learning_rate": 6.691470054446461e-06, + "loss": 32.4735, + "step": 4291 + }, + { + "epoch": 15.494808126410835, + "grad_norm": 257.5385437011719, + "learning_rate": 6.686025408348457e-06, + "loss": 33.9272, + "step": 4292 + }, + { + "epoch": 15.49841986455982, + "grad_norm": 393.9106140136719, + "learning_rate": 6.680580762250454e-06, + "loss": 34.4176, + "step": 4293 + }, + { + "epoch": 15.502031602708804, + "grad_norm": 333.5639953613281, + "learning_rate": 6.675136116152451e-06, + "loss": 34.5695, + "step": 4294 + }, + { + "epoch": 15.505643340857787, + "grad_norm": 319.8660888671875, + "learning_rate": 6.669691470054446e-06, + "loss": 34.5337, + "step": 4295 + }, + { + "epoch": 15.509255079006772, + "grad_norm": 246.78086853027344, + "learning_rate": 6.664246823956443e-06, + "loss": 34.8297, + "step": 4296 + }, + { + "epoch": 15.512866817155757, + "grad_norm": 313.4530944824219, + "learning_rate": 6.658802177858439e-06, + "loss": 34.6901, + "step": 4297 + }, + { + "epoch": 15.51647855530474, + "grad_norm": 257.2852783203125, + "learning_rate": 6.6533575317604364e-06, + "loss": 35.3892, + "step": 4298 + }, + { + "epoch": 15.520090293453725, + "grad_norm": 336.5549011230469, + "learning_rate": 6.647912885662432e-06, + "loss": 36.3347, + "step": 4299 + }, + { + "epoch": 15.523702031602708, + "grad_norm": 275.726806640625, + "learning_rate": 6.642468239564428e-06, + "loss": 36.3559, + "step": 4300 + }, + { + "epoch": 15.523702031602708, + "eval_loss": 0.6056334376335144, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 4300 + }, + { + "epoch": 15.527313769751693, + "grad_norm": 275.5987243652344, + "learning_rate": 6.637023593466425e-06, + "loss": 28.5887, + "step": 4301 + }, + { + "epoch": 15.530925507900678, + "grad_norm": 242.59762573242188, + "learning_rate": 6.631578947368421e-06, + "loss": 22.1398, + "step": 4302 + }, + { + "epoch": 15.534537246049661, + "grad_norm": 228.04344177246094, + "learning_rate": 6.626134301270418e-06, + "loss": 21.4593, + "step": 4303 + }, + { + "epoch": 15.538148984198646, + "grad_norm": 204.2377166748047, + "learning_rate": 6.620689655172414e-06, + "loss": 22.5132, + "step": 4304 + }, + { + "epoch": 15.54176072234763, + "grad_norm": 243.0237579345703, + "learning_rate": 6.615245009074411e-06, + "loss": 24.2777, + "step": 4305 + }, + { + "epoch": 15.545372460496614, + "grad_norm": 227.2841339111328, + "learning_rate": 6.609800362976407e-06, + "loss": 39.7235, + "step": 4306 + }, + { + "epoch": 15.548984198645599, + "grad_norm": 253.8453826904297, + "learning_rate": 6.6043557168784025e-06, + "loss": 39.9317, + "step": 4307 + }, + { + "epoch": 15.552595936794582, + "grad_norm": 243.62757873535156, + "learning_rate": 6.5989110707804e-06, + "loss": 38.9825, + "step": 4308 + }, + { + "epoch": 15.556207674943566, + "grad_norm": 262.4398498535156, + "learning_rate": 6.593466424682396e-06, + "loss": 39.7456, + "step": 4309 + }, + { + "epoch": 15.559819413092551, + "grad_norm": 268.5821228027344, + "learning_rate": 6.588021778584392e-06, + "loss": 39.5152, + "step": 4310 + }, + { + "epoch": 15.559819413092551, + "eval_loss": 0.6060237288475037, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4310 + }, + { + "epoch": 15.563431151241534, + "grad_norm": 297.6933898925781, + "learning_rate": 6.582577132486388e-06, + "loss": 40.1259, + "step": 4311 + }, + { + "epoch": 15.56704288939052, + "grad_norm": 234.08816528320312, + "learning_rate": 6.577132486388385e-06, + "loss": 40.8591, + "step": 4312 + }, + { + "epoch": 15.570654627539504, + "grad_norm": 292.2416687011719, + "learning_rate": 6.571687840290382e-06, + "loss": 39.2377, + "step": 4313 + }, + { + "epoch": 15.574266365688487, + "grad_norm": 205.25888061523438, + "learning_rate": 6.566243194192377e-06, + "loss": 39.92, + "step": 4314 + }, + { + "epoch": 15.577878103837472, + "grad_norm": 229.06695556640625, + "learning_rate": 6.560798548094374e-06, + "loss": 39.8886, + "step": 4315 + }, + { + "epoch": 15.581489841986457, + "grad_norm": 223.3977508544922, + "learning_rate": 6.55535390199637e-06, + "loss": 38.5423, + "step": 4316 + }, + { + "epoch": 15.58510158013544, + "grad_norm": 254.60203552246094, + "learning_rate": 6.549909255898367e-06, + "loss": 36.8055, + "step": 4317 + }, + { + "epoch": 15.588713318284425, + "grad_norm": 304.463623046875, + "learning_rate": 6.544464609800363e-06, + "loss": 37.6164, + "step": 4318 + }, + { + "epoch": 15.592325056433408, + "grad_norm": 279.955810546875, + "learning_rate": 6.53901996370236e-06, + "loss": 37.4778, + "step": 4319 + }, + { + "epoch": 15.595936794582393, + "grad_norm": 230.11105346679688, + "learning_rate": 6.533575317604356e-06, + "loss": 36.9663, + "step": 4320 + }, + { + "epoch": 15.595936794582393, + "eval_loss": 0.6048213243484497, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.966, + "eval_steps_per_second": 56.966, + "step": 4320 + }, + { + "epoch": 15.599548532731378, + "grad_norm": 261.98187255859375, + "learning_rate": 6.528130671506351e-06, + "loss": 37.7402, + "step": 4321 + }, + { + "epoch": 15.60316027088036, + "grad_norm": 247.34771728515625, + "learning_rate": 6.5226860254083485e-06, + "loss": 37.1402, + "step": 4322 + }, + { + "epoch": 15.606772009029346, + "grad_norm": 277.1517333984375, + "learning_rate": 6.517241379310345e-06, + "loss": 38.3976, + "step": 4323 + }, + { + "epoch": 15.610383747178329, + "grad_norm": 231.89683532714844, + "learning_rate": 6.511796733212342e-06, + "loss": 38.0834, + "step": 4324 + }, + { + "epoch": 15.613995485327314, + "grad_norm": 323.8349304199219, + "learning_rate": 6.506352087114337e-06, + "loss": 37.9085, + "step": 4325 + }, + { + "epoch": 15.617607223476298, + "grad_norm": 263.5240783691406, + "learning_rate": 6.500907441016334e-06, + "loss": 37.0702, + "step": 4326 + }, + { + "epoch": 15.621218961625281, + "grad_norm": 217.0517578125, + "learning_rate": 6.4954627949183305e-06, + "loss": 36.9406, + "step": 4327 + }, + { + "epoch": 15.624830699774266, + "grad_norm": 267.4161682128906, + "learning_rate": 6.4900181488203276e-06, + "loss": 38.8773, + "step": 4328 + }, + { + "epoch": 15.628442437923251, + "grad_norm": 232.36000061035156, + "learning_rate": 6.484573502722323e-06, + "loss": 38.4978, + "step": 4329 + }, + { + "epoch": 15.632054176072234, + "grad_norm": 241.61373901367188, + "learning_rate": 6.479128856624319e-06, + "loss": 38.4895, + "step": 4330 + }, + { + "epoch": 15.632054176072234, + "eval_loss": 0.6024956703186035, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4330 + }, + { + "epoch": 15.635665914221219, + "grad_norm": 232.27928161621094, + "learning_rate": 6.473684210526316e-06, + "loss": 38.8551, + "step": 4331 + }, + { + "epoch": 15.639277652370204, + "grad_norm": 243.42828369140625, + "learning_rate": 6.4682395644283125e-06, + "loss": 38.6475, + "step": 4332 + }, + { + "epoch": 15.642889390519187, + "grad_norm": 306.2618103027344, + "learning_rate": 6.462794918330309e-06, + "loss": 37.2015, + "step": 4333 + }, + { + "epoch": 15.646501128668172, + "grad_norm": 335.795166015625, + "learning_rate": 6.457350272232305e-06, + "loss": 36.5255, + "step": 4334 + }, + { + "epoch": 15.650112866817155, + "grad_norm": 209.6246337890625, + "learning_rate": 6.451905626134302e-06, + "loss": 32.4219, + "step": 4335 + }, + { + "epoch": 15.65372460496614, + "grad_norm": 283.2094421386719, + "learning_rate": 6.446460980036297e-06, + "loss": 30.9137, + "step": 4336 + }, + { + "epoch": 15.657336343115125, + "grad_norm": 255.4412841796875, + "learning_rate": 6.441016333938294e-06, + "loss": 30.8939, + "step": 4337 + }, + { + "epoch": 15.660948081264108, + "grad_norm": 217.8052215576172, + "learning_rate": 6.435571687840291e-06, + "loss": 31.5974, + "step": 4338 + }, + { + "epoch": 15.664559819413093, + "grad_norm": 215.64398193359375, + "learning_rate": 6.430127041742287e-06, + "loss": 30.0276, + "step": 4339 + }, + { + "epoch": 15.668171557562077, + "grad_norm": 244.32704162597656, + "learning_rate": 6.424682395644283e-06, + "loss": 32.5249, + "step": 4340 + }, + { + "epoch": 15.668171557562077, + "eval_loss": 0.6037233471870422, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4340 + }, + { + "epoch": 15.67178329571106, + "grad_norm": 270.9132080078125, + "learning_rate": 6.419237749546279e-06, + "loss": 32.9923, + "step": 4341 + }, + { + "epoch": 15.675395033860045, + "grad_norm": 230.20314025878906, + "learning_rate": 6.4137931034482765e-06, + "loss": 32.871, + "step": 4342 + }, + { + "epoch": 15.679006772009028, + "grad_norm": 372.4366149902344, + "learning_rate": 6.408348457350273e-06, + "loss": 35.2687, + "step": 4343 + }, + { + "epoch": 15.682618510158013, + "grad_norm": 325.0901794433594, + "learning_rate": 6.402903811252268e-06, + "loss": 34.3107, + "step": 4344 + }, + { + "epoch": 15.686230248306998, + "grad_norm": 277.8683166503906, + "learning_rate": 6.397459165154265e-06, + "loss": 34.291, + "step": 4345 + }, + { + "epoch": 15.689841986455981, + "grad_norm": 262.566162109375, + "learning_rate": 6.392014519056261e-06, + "loss": 33.2989, + "step": 4346 + }, + { + "epoch": 15.693453724604966, + "grad_norm": 293.56536865234375, + "learning_rate": 6.386569872958258e-06, + "loss": 35.6865, + "step": 4347 + }, + { + "epoch": 15.697065462753951, + "grad_norm": 291.1886291503906, + "learning_rate": 6.381125226860254e-06, + "loss": 35.6959, + "step": 4348 + }, + { + "epoch": 15.700677200902934, + "grad_norm": 265.2365417480469, + "learning_rate": 6.375680580762251e-06, + "loss": 36.479, + "step": 4349 + }, + { + "epoch": 15.704288939051919, + "grad_norm": 342.8822021484375, + "learning_rate": 6.370235934664247e-06, + "loss": 35.9198, + "step": 4350 + }, + { + "epoch": 15.704288939051919, + "eval_loss": 0.603361189365387, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4350 + }, + { + "epoch": 15.707900677200904, + "grad_norm": 276.1657409667969, + "learning_rate": 6.364791288566243e-06, + "loss": 29.429, + "step": 4351 + }, + { + "epoch": 15.711512415349887, + "grad_norm": 267.2456359863281, + "learning_rate": 6.35934664246824e-06, + "loss": 23.0038, + "step": 4352 + }, + { + "epoch": 15.715124153498872, + "grad_norm": 255.4893798828125, + "learning_rate": 6.353901996370236e-06, + "loss": 21.1185, + "step": 4353 + }, + { + "epoch": 15.718735891647855, + "grad_norm": 252.10501098632812, + "learning_rate": 6.348457350272233e-06, + "loss": 23.1769, + "step": 4354 + }, + { + "epoch": 15.72234762979684, + "grad_norm": 239.63905334472656, + "learning_rate": 6.343012704174228e-06, + "loss": 24.5905, + "step": 4355 + }, + { + "epoch": 15.725959367945824, + "grad_norm": 228.00950622558594, + "learning_rate": 6.337568058076225e-06, + "loss": 39.6657, + "step": 4356 + }, + { + "epoch": 15.729571106094808, + "grad_norm": 234.10647583007812, + "learning_rate": 6.332123411978222e-06, + "loss": 41.145, + "step": 4357 + }, + { + "epoch": 15.733182844243792, + "grad_norm": 236.55223083496094, + "learning_rate": 6.326678765880219e-06, + "loss": 40.2784, + "step": 4358 + }, + { + "epoch": 15.736794582392777, + "grad_norm": 340.1712646484375, + "learning_rate": 6.321234119782214e-06, + "loss": 39.3598, + "step": 4359 + }, + { + "epoch": 15.74040632054176, + "grad_norm": 269.4134826660156, + "learning_rate": 6.31578947368421e-06, + "loss": 38.7777, + "step": 4360 + }, + { + "epoch": 15.74040632054176, + "eval_loss": 0.6048015356063843, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4360 + }, + { + "epoch": 15.744018058690745, + "grad_norm": 316.5471496582031, + "learning_rate": 6.310344827586207e-06, + "loss": 39.6707, + "step": 4361 + }, + { + "epoch": 15.747629796839728, + "grad_norm": 231.31820678710938, + "learning_rate": 6.304900181488203e-06, + "loss": 38.0009, + "step": 4362 + }, + { + "epoch": 15.751241534988713, + "grad_norm": 207.19117736816406, + "learning_rate": 6.2994555353902e-06, + "loss": 41.6523, + "step": 4363 + }, + { + "epoch": 15.754853273137698, + "grad_norm": 239.8341064453125, + "learning_rate": 6.294010889292196e-06, + "loss": 40.3203, + "step": 4364 + }, + { + "epoch": 15.758465011286681, + "grad_norm": 277.2004089355469, + "learning_rate": 6.288566243194193e-06, + "loss": 39.8026, + "step": 4365 + }, + { + "epoch": 15.762076749435666, + "grad_norm": 227.74728393554688, + "learning_rate": 6.2831215970961886e-06, + "loss": 38.1561, + "step": 4366 + }, + { + "epoch": 15.76568848758465, + "grad_norm": 268.6826477050781, + "learning_rate": 6.277676950998185e-06, + "loss": 37.4653, + "step": 4367 + }, + { + "epoch": 15.769300225733634, + "grad_norm": 308.92950439453125, + "learning_rate": 6.272232304900182e-06, + "loss": 36.3506, + "step": 4368 + }, + { + "epoch": 15.772911963882619, + "grad_norm": 216.53627014160156, + "learning_rate": 6.266787658802178e-06, + "loss": 36.12, + "step": 4369 + }, + { + "epoch": 15.776523702031604, + "grad_norm": 264.0691833496094, + "learning_rate": 6.261343012704174e-06, + "loss": 37.5023, + "step": 4370 + }, + { + "epoch": 15.776523702031604, + "eval_loss": 0.608928382396698, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.989, + "eval_steps_per_second": 56.989, + "step": 4370 + }, + { + "epoch": 15.780135440180587, + "grad_norm": 474.7265319824219, + "learning_rate": 6.2558983666061706e-06, + "loss": 38.8381, + "step": 4371 + }, + { + "epoch": 15.783747178329572, + "grad_norm": 303.66229248046875, + "learning_rate": 6.250453720508168e-06, + "loss": 36.5951, + "step": 4372 + }, + { + "epoch": 15.787358916478555, + "grad_norm": 231.65744018554688, + "learning_rate": 6.245009074410164e-06, + "loss": 36.4717, + "step": 4373 + }, + { + "epoch": 15.79097065462754, + "grad_norm": 235.25833129882812, + "learning_rate": 6.239564428312159e-06, + "loss": 38.4578, + "step": 4374 + }, + { + "epoch": 15.794582392776524, + "grad_norm": 215.5384063720703, + "learning_rate": 6.234119782214156e-06, + "loss": 38.0475, + "step": 4375 + }, + { + "epoch": 15.798194130925507, + "grad_norm": 216.3609619140625, + "learning_rate": 6.2286751361161526e-06, + "loss": 37.1825, + "step": 4376 + }, + { + "epoch": 15.801805869074492, + "grad_norm": 275.54522705078125, + "learning_rate": 6.223230490018149e-06, + "loss": 38.5608, + "step": 4377 + }, + { + "epoch": 15.805417607223477, + "grad_norm": 226.7752685546875, + "learning_rate": 6.217785843920145e-06, + "loss": 38.0612, + "step": 4378 + }, + { + "epoch": 15.80902934537246, + "grad_norm": 262.14501953125, + "learning_rate": 6.212341197822142e-06, + "loss": 38.0049, + "step": 4379 + }, + { + "epoch": 15.812641083521445, + "grad_norm": 299.82196044921875, + "learning_rate": 6.206896551724138e-06, + "loss": 39.1441, + "step": 4380 + }, + { + "epoch": 15.812641083521445, + "eval_loss": 0.6033969521522522, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4380 + }, + { + "epoch": 15.816252821670428, + "grad_norm": 295.24188232421875, + "learning_rate": 6.2014519056261346e-06, + "loss": 39.266, + "step": 4381 + }, + { + "epoch": 15.819864559819413, + "grad_norm": 298.1729736328125, + "learning_rate": 6.196007259528131e-06, + "loss": 39.4025, + "step": 4382 + }, + { + "epoch": 15.823476297968398, + "grad_norm": 234.97958374023438, + "learning_rate": 6.190562613430127e-06, + "loss": 39.4752, + "step": 4383 + }, + { + "epoch": 15.827088036117381, + "grad_norm": 270.3009338378906, + "learning_rate": 6.185117967332124e-06, + "loss": 36.0322, + "step": 4384 + }, + { + "epoch": 15.830699774266366, + "grad_norm": 279.78314208984375, + "learning_rate": 6.1796733212341195e-06, + "loss": 33.3256, + "step": 4385 + }, + { + "epoch": 15.83431151241535, + "grad_norm": 258.82598876953125, + "learning_rate": 6.1742286751361166e-06, + "loss": 33.1552, + "step": 4386 + }, + { + "epoch": 15.837923250564334, + "grad_norm": 280.8109130859375, + "learning_rate": 6.168784029038113e-06, + "loss": 32.0024, + "step": 4387 + }, + { + "epoch": 15.841534988713319, + "grad_norm": 265.08111572265625, + "learning_rate": 6.163339382940109e-06, + "loss": 32.4901, + "step": 4388 + }, + { + "epoch": 15.845146726862303, + "grad_norm": 316.56427001953125, + "learning_rate": 6.157894736842105e-06, + "loss": 33.1995, + "step": 4389 + }, + { + "epoch": 15.848758465011286, + "grad_norm": 256.03717041015625, + "learning_rate": 6.1524500907441015e-06, + "loss": 33.1914, + "step": 4390 + }, + { + "epoch": 15.848758465011286, + "eval_loss": 0.6017575263977051, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 4390 + }, + { + "epoch": 15.852370203160271, + "grad_norm": 242.54119873046875, + "learning_rate": 6.1470054446460985e-06, + "loss": 33.8459, + "step": 4391 + }, + { + "epoch": 15.855981941309254, + "grad_norm": 259.1406555175781, + "learning_rate": 6.141560798548094e-06, + "loss": 34.1317, + "step": 4392 + }, + { + "epoch": 15.85959367945824, + "grad_norm": 272.77880859375, + "learning_rate": 6.136116152450091e-06, + "loss": 34.2777, + "step": 4393 + }, + { + "epoch": 15.863205417607224, + "grad_norm": 231.60845947265625, + "learning_rate": 6.130671506352087e-06, + "loss": 34.0165, + "step": 4394 + }, + { + "epoch": 15.866817155756207, + "grad_norm": 230.85675048828125, + "learning_rate": 6.125226860254084e-06, + "loss": 34.2761, + "step": 4395 + }, + { + "epoch": 15.870428893905192, + "grad_norm": 307.4486389160156, + "learning_rate": 6.11978221415608e-06, + "loss": 33.7407, + "step": 4396 + }, + { + "epoch": 15.874040632054175, + "grad_norm": 264.7835388183594, + "learning_rate": 6.114337568058076e-06, + "loss": 34.1672, + "step": 4397 + }, + { + "epoch": 15.87765237020316, + "grad_norm": 234.93968200683594, + "learning_rate": 6.108892921960073e-06, + "loss": 35.7158, + "step": 4398 + }, + { + "epoch": 15.881264108352145, + "grad_norm": 300.0079345703125, + "learning_rate": 6.103448275862069e-06, + "loss": 36.1292, + "step": 4399 + }, + { + "epoch": 15.884875846501128, + "grad_norm": 326.20416259765625, + "learning_rate": 6.0980036297640655e-06, + "loss": 34.8222, + "step": 4400 + }, + { + "epoch": 15.884875846501128, + "eval_loss": 0.6024067401885986, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4400 + }, + { + "epoch": 15.888487584650113, + "grad_norm": 214.6174774169922, + "learning_rate": 6.092558983666062e-06, + "loss": 27.4819, + "step": 4401 + }, + { + "epoch": 15.892099322799098, + "grad_norm": 222.7063446044922, + "learning_rate": 6.087114337568059e-06, + "loss": 22.3862, + "step": 4402 + }, + { + "epoch": 15.89571106094808, + "grad_norm": 277.0006103515625, + "learning_rate": 6.081669691470054e-06, + "loss": 22.8483, + "step": 4403 + }, + { + "epoch": 15.899322799097066, + "grad_norm": 264.3949890136719, + "learning_rate": 6.076225045372051e-06, + "loss": 23.2021, + "step": 4404 + }, + { + "epoch": 15.90293453724605, + "grad_norm": 244.04611206054688, + "learning_rate": 6.0707803992740475e-06, + "loss": 23.9378, + "step": 4405 + }, + { + "epoch": 15.906546275395034, + "grad_norm": 219.24403381347656, + "learning_rate": 6.065335753176044e-06, + "loss": 39.4708, + "step": 4406 + }, + { + "epoch": 15.910158013544018, + "grad_norm": 297.3822937011719, + "learning_rate": 6.05989110707804e-06, + "loss": 39.9151, + "step": 4407 + }, + { + "epoch": 15.913769751693001, + "grad_norm": 282.748291015625, + "learning_rate": 6.054446460980036e-06, + "loss": 39.0545, + "step": 4408 + }, + { + "epoch": 15.917381489841986, + "grad_norm": 274.6419982910156, + "learning_rate": 6.049001814882033e-06, + "loss": 39.7046, + "step": 4409 + }, + { + "epoch": 15.920993227990971, + "grad_norm": 261.2831115722656, + "learning_rate": 6.0435571687840295e-06, + "loss": 39.8849, + "step": 4410 + }, + { + "epoch": 15.920993227990971, + "eval_loss": 0.6017056107521057, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 4410 + }, + { + "epoch": 15.924604966139954, + "grad_norm": 276.61505126953125, + "learning_rate": 6.038112522686026e-06, + "loss": 39.8861, + "step": 4411 + }, + { + "epoch": 15.928216704288939, + "grad_norm": 273.4017333984375, + "learning_rate": 6.032667876588022e-06, + "loss": 36.2526, + "step": 4412 + }, + { + "epoch": 15.931828442437924, + "grad_norm": 314.4811706542969, + "learning_rate": 6.027223230490018e-06, + "loss": 37.1316, + "step": 4413 + }, + { + "epoch": 15.935440180586907, + "grad_norm": 265.7447204589844, + "learning_rate": 6.021778584392014e-06, + "loss": 38.1698, + "step": 4414 + }, + { + "epoch": 15.939051918735892, + "grad_norm": 448.373291015625, + "learning_rate": 6.016333938294011e-06, + "loss": 38.9541, + "step": 4415 + }, + { + "epoch": 15.942663656884875, + "grad_norm": 261.33966064453125, + "learning_rate": 6.010889292196008e-06, + "loss": 36.6694, + "step": 4416 + }, + { + "epoch": 15.94627539503386, + "grad_norm": 383.16363525390625, + "learning_rate": 6.005444646098004e-06, + "loss": 39.1773, + "step": 4417 + }, + { + "epoch": 15.949887133182845, + "grad_norm": 279.26446533203125, + "learning_rate": 6e-06, + "loss": 36.9482, + "step": 4418 + }, + { + "epoch": 15.953498871331828, + "grad_norm": 307.5321960449219, + "learning_rate": 5.994555353901996e-06, + "loss": 36.653, + "step": 4419 + }, + { + "epoch": 15.957110609480813, + "grad_norm": 412.80023193359375, + "learning_rate": 5.989110707803993e-06, + "loss": 36.3768, + "step": 4420 + }, + { + "epoch": 15.957110609480813, + "eval_loss": 0.6033455729484558, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4420 + }, + { + "epoch": 15.960722347629797, + "grad_norm": 254.2952880859375, + "learning_rate": 5.98366606170599e-06, + "loss": 32.546, + "step": 4421 + }, + { + "epoch": 15.96433408577878, + "grad_norm": 324.0749816894531, + "learning_rate": 5.978221415607985e-06, + "loss": 32.7021, + "step": 4422 + }, + { + "epoch": 15.967945823927765, + "grad_norm": 326.0075988769531, + "learning_rate": 5.972776769509982e-06, + "loss": 33.3823, + "step": 4423 + }, + { + "epoch": 15.97155756207675, + "grad_norm": 252.98471069335938, + "learning_rate": 5.967332123411978e-06, + "loss": 33.3397, + "step": 4424 + }, + { + "epoch": 15.975169300225733, + "grad_norm": 243.14117431640625, + "learning_rate": 5.9618874773139755e-06, + "loss": 34.2781, + "step": 4425 + }, + { + "epoch": 15.978781038374718, + "grad_norm": 304.3429260253906, + "learning_rate": 5.956442831215971e-06, + "loss": 34.1163, + "step": 4426 + }, + { + "epoch": 15.982392776523701, + "grad_norm": 320.1651916503906, + "learning_rate": 5.950998185117968e-06, + "loss": 34.1024, + "step": 4427 + }, + { + "epoch": 15.986004514672686, + "grad_norm": 252.0004425048828, + "learning_rate": 5.945553539019964e-06, + "loss": 35.8121, + "step": 4428 + }, + { + "epoch": 15.989616252821671, + "grad_norm": 342.5635986328125, + "learning_rate": 5.9401088929219595e-06, + "loss": 35.6666, + "step": 4429 + }, + { + "epoch": 15.993227990970654, + "grad_norm": 226.57249450683594, + "learning_rate": 5.934664246823957e-06, + "loss": 30.2617, + "step": 4430 + }, + { + "epoch": 15.993227990970654, + "eval_loss": 0.6029886603355408, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4430 + }, + { + "epoch": 15.996839729119639, + "grad_norm": 202.94903564453125, + "learning_rate": 5.929219600725953e-06, + "loss": 22.8166, + "step": 4431 + }, + { + "epoch": 16.0, + "grad_norm": 200.84317016601562, + "learning_rate": 5.92377495462795e-06, + "loss": 20.3903, + "step": 4432 + }, + { + "epoch": 16.003611738148983, + "grad_norm": 230.5917510986328, + "learning_rate": 5.918330308529945e-06, + "loss": 39.0985, + "step": 4433 + }, + { + "epoch": 16.00722347629797, + "grad_norm": 285.6978759765625, + "learning_rate": 5.912885662431942e-06, + "loss": 39.2128, + "step": 4434 + }, + { + "epoch": 16.010835214446953, + "grad_norm": 221.70896911621094, + "learning_rate": 5.907441016333939e-06, + "loss": 38.9026, + "step": 4435 + }, + { + "epoch": 16.014446952595936, + "grad_norm": 318.14068603515625, + "learning_rate": 5.901996370235935e-06, + "loss": 38.7336, + "step": 4436 + }, + { + "epoch": 16.018058690744923, + "grad_norm": 324.451904296875, + "learning_rate": 5.896551724137931e-06, + "loss": 38.7117, + "step": 4437 + }, + { + "epoch": 16.021670428893906, + "grad_norm": 295.038818359375, + "learning_rate": 5.891107078039927e-06, + "loss": 39.6053, + "step": 4438 + }, + { + "epoch": 16.02528216704289, + "grad_norm": 267.0055236816406, + "learning_rate": 5.885662431941924e-06, + "loss": 38.931, + "step": 4439 + }, + { + "epoch": 16.028893905191875, + "grad_norm": 269.20074462890625, + "learning_rate": 5.88021778584392e-06, + "loss": 41.1717, + "step": 4440 + }, + { + "epoch": 16.028893905191875, + "eval_loss": 0.6036069393157959, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.899, + "eval_steps_per_second": 56.899, + "step": 4440 + }, + { + "epoch": 16.03250564334086, + "grad_norm": 241.9443359375, + "learning_rate": 5.874773139745917e-06, + "loss": 38.7027, + "step": 4441 + }, + { + "epoch": 16.03611738148984, + "grad_norm": 238.54847717285156, + "learning_rate": 5.869328493647913e-06, + "loss": 39.1284, + "step": 4442 + }, + { + "epoch": 16.039729119638825, + "grad_norm": 339.3023681640625, + "learning_rate": 5.863883847549909e-06, + "loss": 38.0767, + "step": 4443 + }, + { + "epoch": 16.04334085778781, + "grad_norm": 257.29522705078125, + "learning_rate": 5.8584392014519055e-06, + "loss": 34.8207, + "step": 4444 + }, + { + "epoch": 16.046952595936794, + "grad_norm": 264.24200439453125, + "learning_rate": 5.852994555353902e-06, + "loss": 35.5021, + "step": 4445 + }, + { + "epoch": 16.050564334085777, + "grad_norm": 251.3128662109375, + "learning_rate": 5.847549909255899e-06, + "loss": 35.7826, + "step": 4446 + }, + { + "epoch": 16.054176072234764, + "grad_norm": 310.6581726074219, + "learning_rate": 5.842105263157895e-06, + "loss": 36.7373, + "step": 4447 + }, + { + "epoch": 16.057787810383747, + "grad_norm": 299.07550048828125, + "learning_rate": 5.836660617059891e-06, + "loss": 36.4048, + "step": 4448 + }, + { + "epoch": 16.06139954853273, + "grad_norm": 257.58740234375, + "learning_rate": 5.8312159709618875e-06, + "loss": 36.3982, + "step": 4449 + }, + { + "epoch": 16.065011286681717, + "grad_norm": 337.6795654296875, + "learning_rate": 5.825771324863884e-06, + "loss": 36.8518, + "step": 4450 + }, + { + "epoch": 16.065011286681717, + "eval_loss": 0.6036850214004517, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4450 + }, + { + "epoch": 16.0686230248307, + "grad_norm": 275.02423095703125, + "learning_rate": 5.820326678765881e-06, + "loss": 36.1763, + "step": 4451 + }, + { + "epoch": 16.072234762979683, + "grad_norm": 263.4334716796875, + "learning_rate": 5.814882032667876e-06, + "loss": 37.6417, + "step": 4452 + }, + { + "epoch": 16.07584650112867, + "grad_norm": 213.16749572753906, + "learning_rate": 5.809437386569873e-06, + "loss": 35.6537, + "step": 4453 + }, + { + "epoch": 16.079458239277653, + "grad_norm": 263.4288330078125, + "learning_rate": 5.8039927404718695e-06, + "loss": 36.5693, + "step": 4454 + }, + { + "epoch": 16.083069977426636, + "grad_norm": 284.67254638671875, + "learning_rate": 5.798548094373866e-06, + "loss": 37.3424, + "step": 4455 + }, + { + "epoch": 16.086681715575622, + "grad_norm": 355.7987060546875, + "learning_rate": 5.793103448275862e-06, + "loss": 38.7851, + "step": 4456 + }, + { + "epoch": 16.090293453724605, + "grad_norm": 249.7351531982422, + "learning_rate": 5.787658802177859e-06, + "loss": 38.1334, + "step": 4457 + }, + { + "epoch": 16.09390519187359, + "grad_norm": 257.4977722167969, + "learning_rate": 5.782214156079855e-06, + "loss": 37.8369, + "step": 4458 + }, + { + "epoch": 16.097516930022575, + "grad_norm": 242.59584045410156, + "learning_rate": 5.776769509981851e-06, + "loss": 37.4005, + "step": 4459 + }, + { + "epoch": 16.101128668171558, + "grad_norm": 270.0740966796875, + "learning_rate": 5.771324863883848e-06, + "loss": 38.2287, + "step": 4460 + }, + { + "epoch": 16.101128668171558, + "eval_loss": 0.6018803119659424, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 4460 + }, + { + "epoch": 16.10474040632054, + "grad_norm": 225.32322692871094, + "learning_rate": 5.765880217785844e-06, + "loss": 35.7162, + "step": 4461 + }, + { + "epoch": 16.108352144469524, + "grad_norm": 275.3272705078125, + "learning_rate": 5.760435571687841e-06, + "loss": 32.8733, + "step": 4462 + }, + { + "epoch": 16.11196388261851, + "grad_norm": 259.5124206542969, + "learning_rate": 5.7549909255898364e-06, + "loss": 33.2271, + "step": 4463 + }, + { + "epoch": 16.115575620767494, + "grad_norm": 249.75738525390625, + "learning_rate": 5.7495462794918335e-06, + "loss": 30.2931, + "step": 4464 + }, + { + "epoch": 16.119187358916477, + "grad_norm": 277.7652282714844, + "learning_rate": 5.74410163339383e-06, + "loss": 30.9294, + "step": 4465 + }, + { + "epoch": 16.122799097065464, + "grad_norm": 223.28250122070312, + "learning_rate": 5.738656987295825e-06, + "loss": 31.7337, + "step": 4466 + }, + { + "epoch": 16.126410835214447, + "grad_norm": 259.5106201171875, + "learning_rate": 5.733212341197822e-06, + "loss": 31.2897, + "step": 4467 + }, + { + "epoch": 16.13002257336343, + "grad_norm": 241.0313720703125, + "learning_rate": 5.7277676950998184e-06, + "loss": 32.8436, + "step": 4468 + }, + { + "epoch": 16.133634311512417, + "grad_norm": 277.46905517578125, + "learning_rate": 5.7223230490018155e-06, + "loss": 33.6823, + "step": 4469 + }, + { + "epoch": 16.1372460496614, + "grad_norm": 264.2905578613281, + "learning_rate": 5.716878402903811e-06, + "loss": 33.1107, + "step": 4470 + }, + { + "epoch": 16.1372460496614, + "eval_loss": 0.6046355962753296, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 4470 + }, + { + "epoch": 16.140857787810383, + "grad_norm": 295.5188903808594, + "learning_rate": 5.711433756805808e-06, + "loss": 33.6291, + "step": 4471 + }, + { + "epoch": 16.14446952595937, + "grad_norm": 282.6014709472656, + "learning_rate": 5.705989110707804e-06, + "loss": 33.0773, + "step": 4472 + }, + { + "epoch": 16.148081264108352, + "grad_norm": 270.7958679199219, + "learning_rate": 5.7005444646098004e-06, + "loss": 35.0269, + "step": 4473 + }, + { + "epoch": 16.151693002257336, + "grad_norm": 344.7304992675781, + "learning_rate": 5.695099818511797e-06, + "loss": 35.1349, + "step": 4474 + }, + { + "epoch": 16.155304740406322, + "grad_norm": 294.5618896484375, + "learning_rate": 5.689655172413793e-06, + "loss": 36.3309, + "step": 4475 + }, + { + "epoch": 16.158916478555305, + "grad_norm": 305.5354309082031, + "learning_rate": 5.68421052631579e-06, + "loss": 35.0976, + "step": 4476 + }, + { + "epoch": 16.16252821670429, + "grad_norm": 293.9934387207031, + "learning_rate": 5.678765880217786e-06, + "loss": 34.9113, + "step": 4477 + }, + { + "epoch": 16.16613995485327, + "grad_norm": 277.9523010253906, + "learning_rate": 5.6733212341197824e-06, + "loss": 24.8815, + "step": 4478 + }, + { + "epoch": 16.169751693002258, + "grad_norm": 297.0547790527344, + "learning_rate": 5.667876588021779e-06, + "loss": 22.4544, + "step": 4479 + }, + { + "epoch": 16.17336343115124, + "grad_norm": 237.44741821289062, + "learning_rate": 5.662431941923776e-06, + "loss": 21.8323, + "step": 4480 + }, + { + "epoch": 16.17336343115124, + "eval_loss": 0.6061411499977112, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4480 + }, + { + "epoch": 16.176975169300224, + "grad_norm": 220.5832977294922, + "learning_rate": 5.656987295825771e-06, + "loss": 22.7531, + "step": 4481 + }, + { + "epoch": 16.18058690744921, + "grad_norm": 298.8033142089844, + "learning_rate": 5.651542649727767e-06, + "loss": 23.7107, + "step": 4482 + }, + { + "epoch": 16.184198645598194, + "grad_norm": 250.02593994140625, + "learning_rate": 5.6460980036297644e-06, + "loss": 39.1679, + "step": 4483 + }, + { + "epoch": 16.187810383747177, + "grad_norm": 253.00746154785156, + "learning_rate": 5.640653357531761e-06, + "loss": 40.6492, + "step": 4484 + }, + { + "epoch": 16.191422121896164, + "grad_norm": 215.04270935058594, + "learning_rate": 5.635208711433757e-06, + "loss": 38.604, + "step": 4485 + }, + { + "epoch": 16.195033860045147, + "grad_norm": 395.6152648925781, + "learning_rate": 5.629764065335753e-06, + "loss": 39.1417, + "step": 4486 + }, + { + "epoch": 16.19864559819413, + "grad_norm": 380.3653869628906, + "learning_rate": 5.62431941923775e-06, + "loss": 39.4322, + "step": 4487 + }, + { + "epoch": 16.202257336343116, + "grad_norm": 309.3524475097656, + "learning_rate": 5.6188747731397464e-06, + "loss": 39.1721, + "step": 4488 + }, + { + "epoch": 16.2058690744921, + "grad_norm": 237.88262939453125, + "learning_rate": 5.613430127041742e-06, + "loss": 39.1462, + "step": 4489 + }, + { + "epoch": 16.209480812641083, + "grad_norm": 233.66690063476562, + "learning_rate": 5.607985480943739e-06, + "loss": 39.8177, + "step": 4490 + }, + { + "epoch": 16.209480812641083, + "eval_loss": 0.6043822169303894, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4490 + }, + { + "epoch": 16.21309255079007, + "grad_norm": 229.3720703125, + "learning_rate": 5.602540834845735e-06, + "loss": 39.7878, + "step": 4491 + }, + { + "epoch": 16.216704288939052, + "grad_norm": 228.66493225097656, + "learning_rate": 5.597096188747731e-06, + "loss": 40.0754, + "step": 4492 + }, + { + "epoch": 16.220316027088035, + "grad_norm": 276.40240478515625, + "learning_rate": 5.591651542649728e-06, + "loss": 38.7709, + "step": 4493 + }, + { + "epoch": 16.223927765237022, + "grad_norm": 268.62371826171875, + "learning_rate": 5.586206896551725e-06, + "loss": 37.7439, + "step": 4494 + }, + { + "epoch": 16.227539503386005, + "grad_norm": 271.0934753417969, + "learning_rate": 5.580762250453721e-06, + "loss": 38.2511, + "step": 4495 + }, + { + "epoch": 16.231151241534988, + "grad_norm": 253.63385009765625, + "learning_rate": 5.575317604355716e-06, + "loss": 36.716, + "step": 4496 + }, + { + "epoch": 16.23476297968397, + "grad_norm": 265.1177978515625, + "learning_rate": 5.569872958257713e-06, + "loss": 36.5517, + "step": 4497 + }, + { + "epoch": 16.238374717832958, + "grad_norm": 332.52972412109375, + "learning_rate": 5.56442831215971e-06, + "loss": 37.1524, + "step": 4498 + }, + { + "epoch": 16.24198645598194, + "grad_norm": 247.53643798828125, + "learning_rate": 5.558983666061707e-06, + "loss": 36.6666, + "step": 4499 + }, + { + "epoch": 16.245598194130924, + "grad_norm": 233.3318634033203, + "learning_rate": 5.553539019963702e-06, + "loss": 37.0842, + "step": 4500 + }, + { + "epoch": 16.245598194130924, + "eval_loss": 0.6042913794517517, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4500 + }, + { + "epoch": 16.24920993227991, + "grad_norm": 222.98350524902344, + "learning_rate": 5.548094373865699e-06, + "loss": 37.6382, + "step": 4501 + }, + { + "epoch": 16.252821670428894, + "grad_norm": 234.33267211914062, + "learning_rate": 5.542649727767695e-06, + "loss": 38.0509, + "step": 4502 + }, + { + "epoch": 16.256433408577877, + "grad_norm": 303.56005859375, + "learning_rate": 5.5372050816696924e-06, + "loss": 36.509, + "step": 4503 + }, + { + "epoch": 16.260045146726863, + "grad_norm": 232.0821075439453, + "learning_rate": 5.531760435571688e-06, + "loss": 36.3975, + "step": 4504 + }, + { + "epoch": 16.263656884875846, + "grad_norm": 223.3292236328125, + "learning_rate": 5.526315789473684e-06, + "loss": 37.0448, + "step": 4505 + }, + { + "epoch": 16.26726862302483, + "grad_norm": 241.2131805419922, + "learning_rate": 5.520871143375681e-06, + "loss": 37.8635, + "step": 4506 + }, + { + "epoch": 16.270880361173816, + "grad_norm": 288.62689208984375, + "learning_rate": 5.5154264972776765e-06, + "loss": 38.2789, + "step": 4507 + }, + { + "epoch": 16.2744920993228, + "grad_norm": 262.59637451171875, + "learning_rate": 5.5099818511796736e-06, + "loss": 37.9052, + "step": 4508 + }, + { + "epoch": 16.278103837471782, + "grad_norm": 258.0476379394531, + "learning_rate": 5.50453720508167e-06, + "loss": 38.0485, + "step": 4509 + }, + { + "epoch": 16.28171557562077, + "grad_norm": 295.2730407714844, + "learning_rate": 5.499092558983667e-06, + "loss": 37.6134, + "step": 4510 + }, + { + "epoch": 16.28171557562077, + "eval_loss": 0.601740300655365, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4510 + }, + { + "epoch": 16.285327313769752, + "grad_norm": 246.38548278808594, + "learning_rate": 5.493647912885662e-06, + "loss": 36.1289, + "step": 4511 + }, + { + "epoch": 16.288939051918735, + "grad_norm": 271.28997802734375, + "learning_rate": 5.4882032667876585e-06, + "loss": 31.8834, + "step": 4512 + }, + { + "epoch": 16.292550790067722, + "grad_norm": 231.76246643066406, + "learning_rate": 5.4827586206896556e-06, + "loss": 31.4899, + "step": 4513 + }, + { + "epoch": 16.296162528216705, + "grad_norm": 238.7414093017578, + "learning_rate": 5.477313974591652e-06, + "loss": 31.7102, + "step": 4514 + }, + { + "epoch": 16.299774266365688, + "grad_norm": 302.0710144042969, + "learning_rate": 5.471869328493648e-06, + "loss": 31.3557, + "step": 4515 + }, + { + "epoch": 16.30338600451467, + "grad_norm": 282.72015380859375, + "learning_rate": 5.466424682395644e-06, + "loss": 33.0781, + "step": 4516 + }, + { + "epoch": 16.306997742663658, + "grad_norm": 224.8140869140625, + "learning_rate": 5.460980036297641e-06, + "loss": 33.2963, + "step": 4517 + }, + { + "epoch": 16.31060948081264, + "grad_norm": 239.20570373535156, + "learning_rate": 5.4555353901996376e-06, + "loss": 34.4455, + "step": 4518 + }, + { + "epoch": 16.314221218961624, + "grad_norm": 304.7758483886719, + "learning_rate": 5.450090744101633e-06, + "loss": 34.534, + "step": 4519 + }, + { + "epoch": 16.31783295711061, + "grad_norm": 274.8758239746094, + "learning_rate": 5.44464609800363e-06, + "loss": 33.5232, + "step": 4520 + }, + { + "epoch": 16.31783295711061, + "eval_loss": 0.6031973958015442, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4520 + }, + { + "epoch": 16.321444695259594, + "grad_norm": 295.1776428222656, + "learning_rate": 5.439201451905626e-06, + "loss": 33.403, + "step": 4521 + }, + { + "epoch": 16.325056433408577, + "grad_norm": 309.03399658203125, + "learning_rate": 5.4337568058076225e-06, + "loss": 34.1785, + "step": 4522 + }, + { + "epoch": 16.328668171557563, + "grad_norm": 285.26385498046875, + "learning_rate": 5.428312159709619e-06, + "loss": 34.4855, + "step": 4523 + }, + { + "epoch": 16.332279909706546, + "grad_norm": 307.0184020996094, + "learning_rate": 5.422867513611616e-06, + "loss": 32.4791, + "step": 4524 + }, + { + "epoch": 16.33589164785553, + "grad_norm": 318.8267822265625, + "learning_rate": 5.417422867513612e-06, + "loss": 35.697, + "step": 4525 + }, + { + "epoch": 16.339503386004516, + "grad_norm": 356.0179138183594, + "learning_rate": 5.411978221415607e-06, + "loss": 36.1811, + "step": 4526 + }, + { + "epoch": 16.3431151241535, + "grad_norm": 332.1255187988281, + "learning_rate": 5.4065335753176045e-06, + "loss": 36.2251, + "step": 4527 + }, + { + "epoch": 16.346726862302482, + "grad_norm": 288.78118896484375, + "learning_rate": 5.401088929219601e-06, + "loss": 32.0518, + "step": 4528 + }, + { + "epoch": 16.35033860045147, + "grad_norm": 250.37245178222656, + "learning_rate": 5.395644283121598e-06, + "loss": 23.627, + "step": 4529 + }, + { + "epoch": 16.353950338600452, + "grad_norm": 199.92352294921875, + "learning_rate": 5.390199637023593e-06, + "loss": 21.7919, + "step": 4530 + }, + { + "epoch": 16.353950338600452, + "eval_loss": 0.6021688580513, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 4530 + }, + { + "epoch": 16.357562076749435, + "grad_norm": 265.47015380859375, + "learning_rate": 5.38475499092559e-06, + "loss": 23.0672, + "step": 4531 + }, + { + "epoch": 16.36117381489842, + "grad_norm": 281.188720703125, + "learning_rate": 5.3793103448275865e-06, + "loss": 22.7983, + "step": 4532 + }, + { + "epoch": 16.364785553047405, + "grad_norm": 195.5351104736328, + "learning_rate": 5.373865698729583e-06, + "loss": 38.1042, + "step": 4533 + }, + { + "epoch": 16.368397291196388, + "grad_norm": 234.76573181152344, + "learning_rate": 5.368421052631579e-06, + "loss": 39.8602, + "step": 4534 + }, + { + "epoch": 16.37200902934537, + "grad_norm": 237.9152374267578, + "learning_rate": 5.362976406533575e-06, + "loss": 40.2156, + "step": 4535 + }, + { + "epoch": 16.375620767494357, + "grad_norm": 297.722900390625, + "learning_rate": 5.357531760435572e-06, + "loss": 39.3676, + "step": 4536 + }, + { + "epoch": 16.37923250564334, + "grad_norm": 218.61727905273438, + "learning_rate": 5.352087114337568e-06, + "loss": 38.7905, + "step": 4537 + }, + { + "epoch": 16.382844243792324, + "grad_norm": 245.19561767578125, + "learning_rate": 5.346642468239565e-06, + "loss": 39.3998, + "step": 4538 + }, + { + "epoch": 16.38645598194131, + "grad_norm": 247.5048370361328, + "learning_rate": 5.341197822141561e-06, + "loss": 40.0835, + "step": 4539 + }, + { + "epoch": 16.390067720090293, + "grad_norm": 214.40684509277344, + "learning_rate": 5.335753176043558e-06, + "loss": 39.1135, + "step": 4540 + }, + { + "epoch": 16.390067720090293, + "eval_loss": 0.6014460325241089, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4540 + }, + { + "epoch": 16.393679458239276, + "grad_norm": 216.72271728515625, + "learning_rate": 5.330308529945553e-06, + "loss": 38.9449, + "step": 4541 + }, + { + "epoch": 16.397291196388263, + "grad_norm": 224.22262573242188, + "learning_rate": 5.32486388384755e-06, + "loss": 39.2646, + "step": 4542 + }, + { + "epoch": 16.400902934537246, + "grad_norm": 258.6524353027344, + "learning_rate": 5.319419237749547e-06, + "loss": 38.0846, + "step": 4543 + }, + { + "epoch": 16.40451467268623, + "grad_norm": 241.7313232421875, + "learning_rate": 5.313974591651543e-06, + "loss": 37.4963, + "step": 4544 + }, + { + "epoch": 16.408126410835216, + "grad_norm": 241.3990478515625, + "learning_rate": 5.308529945553539e-06, + "loss": 36.4783, + "step": 4545 + }, + { + "epoch": 16.4117381489842, + "grad_norm": 207.1470947265625, + "learning_rate": 5.303085299455535e-06, + "loss": 36.1592, + "step": 4546 + }, + { + "epoch": 16.415349887133182, + "grad_norm": 224.51690673828125, + "learning_rate": 5.2976406533575325e-06, + "loss": 35.7946, + "step": 4547 + }, + { + "epoch": 16.41896162528217, + "grad_norm": 292.4340515136719, + "learning_rate": 5.292196007259528e-06, + "loss": 36.8986, + "step": 4548 + }, + { + "epoch": 16.42257336343115, + "grad_norm": 244.67117309570312, + "learning_rate": 5.286751361161524e-06, + "loss": 37.1165, + "step": 4549 + }, + { + "epoch": 16.426185101580135, + "grad_norm": 331.14654541015625, + "learning_rate": 5.281306715063521e-06, + "loss": 36.4423, + "step": 4550 + }, + { + "epoch": 16.426185101580135, + "eval_loss": 0.6067427396774292, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4550 + }, + { + "epoch": 16.42979683972912, + "grad_norm": 262.373046875, + "learning_rate": 5.275862068965517e-06, + "loss": 39.0014, + "step": 4551 + }, + { + "epoch": 16.433408577878104, + "grad_norm": 237.48350524902344, + "learning_rate": 5.270417422867514e-06, + "loss": 38.0152, + "step": 4552 + }, + { + "epoch": 16.437020316027088, + "grad_norm": 273.0652770996094, + "learning_rate": 5.26497277676951e-06, + "loss": 37.6952, + "step": 4553 + }, + { + "epoch": 16.44063205417607, + "grad_norm": 239.0780029296875, + "learning_rate": 5.259528130671507e-06, + "loss": 38.4266, + "step": 4554 + }, + { + "epoch": 16.444243792325057, + "grad_norm": 277.978759765625, + "learning_rate": 5.254083484573503e-06, + "loss": 36.5596, + "step": 4555 + }, + { + "epoch": 16.44785553047404, + "grad_norm": 216.2267303466797, + "learning_rate": 5.248638838475499e-06, + "loss": 39.1408, + "step": 4556 + }, + { + "epoch": 16.451467268623023, + "grad_norm": 231.80581665039062, + "learning_rate": 5.243194192377496e-06, + "loss": 38.7286, + "step": 4557 + }, + { + "epoch": 16.45507900677201, + "grad_norm": 236.4004669189453, + "learning_rate": 5.237749546279492e-06, + "loss": 39.2426, + "step": 4558 + }, + { + "epoch": 16.458690744920993, + "grad_norm": 270.0268859863281, + "learning_rate": 5.232304900181488e-06, + "loss": 38.6546, + "step": 4559 + }, + { + "epoch": 16.462302483069976, + "grad_norm": 255.8044891357422, + "learning_rate": 5.226860254083484e-06, + "loss": 37.554, + "step": 4560 + }, + { + "epoch": 16.462302483069976, + "eval_loss": 0.6019929647445679, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4560 + }, + { + "epoch": 16.465914221218963, + "grad_norm": 321.18499755859375, + "learning_rate": 5.221415607985481e-06, + "loss": 34.9309, + "step": 4561 + }, + { + "epoch": 16.469525959367946, + "grad_norm": 311.94305419921875, + "learning_rate": 5.215970961887478e-06, + "loss": 35.8779, + "step": 4562 + }, + { + "epoch": 16.47313769751693, + "grad_norm": 211.90234375, + "learning_rate": 5.210526315789474e-06, + "loss": 31.8385, + "step": 4563 + }, + { + "epoch": 16.476749435665916, + "grad_norm": 284.64581298828125, + "learning_rate": 5.20508166969147e-06, + "loss": 31.8078, + "step": 4564 + }, + { + "epoch": 16.4803611738149, + "grad_norm": 291.94891357421875, + "learning_rate": 5.199637023593466e-06, + "loss": 33.2542, + "step": 4565 + }, + { + "epoch": 16.483972911963882, + "grad_norm": 243.61956787109375, + "learning_rate": 5.194192377495463e-06, + "loss": 31.5292, + "step": 4566 + }, + { + "epoch": 16.48758465011287, + "grad_norm": 242.07696533203125, + "learning_rate": 5.188747731397459e-06, + "loss": 33.9643, + "step": 4567 + }, + { + "epoch": 16.49119638826185, + "grad_norm": 255.0625457763672, + "learning_rate": 5.183303085299456e-06, + "loss": 33.7718, + "step": 4568 + }, + { + "epoch": 16.494808126410835, + "grad_norm": 249.40240478515625, + "learning_rate": 5.177858439201452e-06, + "loss": 31.5248, + "step": 4569 + }, + { + "epoch": 16.498419864559818, + "grad_norm": 231.3375244140625, + "learning_rate": 5.172413793103449e-06, + "loss": 34.5657, + "step": 4570 + }, + { + "epoch": 16.498419864559818, + "eval_loss": 0.6017265319824219, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.01, + "eval_steps_per_second": 57.01, + "step": 4570 + }, + { + "epoch": 16.502031602708804, + "grad_norm": 247.97012329101562, + "learning_rate": 5.1669691470054445e-06, + "loss": 33.766, + "step": 4571 + }, + { + "epoch": 16.505643340857787, + "grad_norm": 310.730224609375, + "learning_rate": 5.161524500907441e-06, + "loss": 34.0841, + "step": 4572 + }, + { + "epoch": 16.50925507900677, + "grad_norm": 323.5569152832031, + "learning_rate": 5.156079854809438e-06, + "loss": 35.0788, + "step": 4573 + }, + { + "epoch": 16.512866817155757, + "grad_norm": 247.95480346679688, + "learning_rate": 5.150635208711433e-06, + "loss": 33.5322, + "step": 4574 + }, + { + "epoch": 16.51647855530474, + "grad_norm": 307.6163024902344, + "learning_rate": 5.14519056261343e-06, + "loss": 34.4701, + "step": 4575 + }, + { + "epoch": 16.520090293453723, + "grad_norm": 239.569580078125, + "learning_rate": 5.1397459165154265e-06, + "loss": 35.8526, + "step": 4576 + }, + { + "epoch": 16.52370203160271, + "grad_norm": 362.4159240722656, + "learning_rate": 5.134301270417424e-06, + "loss": 36.2235, + "step": 4577 + }, + { + "epoch": 16.527313769751693, + "grad_norm": 321.2509765625, + "learning_rate": 5.128856624319419e-06, + "loss": 33.4705, + "step": 4578 + }, + { + "epoch": 16.530925507900676, + "grad_norm": 248.6092071533203, + "learning_rate": 5.123411978221415e-06, + "loss": 23.1329, + "step": 4579 + }, + { + "epoch": 16.534537246049663, + "grad_norm": 289.8996276855469, + "learning_rate": 5.117967332123412e-06, + "loss": 20.3184, + "step": 4580 + }, + { + "epoch": 16.534537246049663, + "eval_loss": 0.6034744381904602, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 4580 + }, + { + "epoch": 16.538148984198646, + "grad_norm": 215.02142333984375, + "learning_rate": 5.1125226860254085e-06, + "loss": 23.0513, + "step": 4581 + }, + { + "epoch": 16.54176072234763, + "grad_norm": 299.8429870605469, + "learning_rate": 5.107078039927405e-06, + "loss": 24.462, + "step": 4582 + }, + { + "epoch": 16.545372460496615, + "grad_norm": 267.0840759277344, + "learning_rate": 5.101633393829401e-06, + "loss": 39.9148, + "step": 4583 + }, + { + "epoch": 16.5489841986456, + "grad_norm": 227.23731994628906, + "learning_rate": 5.096188747731398e-06, + "loss": 40.6498, + "step": 4584 + }, + { + "epoch": 16.55259593679458, + "grad_norm": 313.9705810546875, + "learning_rate": 5.0907441016333935e-06, + "loss": 38.7711, + "step": 4585 + }, + { + "epoch": 16.55620767494357, + "grad_norm": 398.0429382324219, + "learning_rate": 5.0852994555353905e-06, + "loss": 39.6938, + "step": 4586 + }, + { + "epoch": 16.55981941309255, + "grad_norm": 365.489990234375, + "learning_rate": 5.079854809437387e-06, + "loss": 39.356, + "step": 4587 + }, + { + "epoch": 16.563431151241534, + "grad_norm": 365.05267333984375, + "learning_rate": 5.074410163339383e-06, + "loss": 40.2504, + "step": 4588 + }, + { + "epoch": 16.567042889390518, + "grad_norm": 288.0643310546875, + "learning_rate": 5.068965517241379e-06, + "loss": 39.6045, + "step": 4589 + }, + { + "epoch": 16.570654627539504, + "grad_norm": 262.0147705078125, + "learning_rate": 5.0635208711433755e-06, + "loss": 40.2504, + "step": 4590 + }, + { + "epoch": 16.570654627539504, + "eval_loss": 0.6028281450271606, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 4590 + }, + { + "epoch": 16.574266365688487, + "grad_norm": 325.78387451171875, + "learning_rate": 5.0580762250453725e-06, + "loss": 40.3154, + "step": 4591 + }, + { + "epoch": 16.57787810383747, + "grad_norm": 221.56591796875, + "learning_rate": 5.052631578947369e-06, + "loss": 39.5046, + "step": 4592 + }, + { + "epoch": 16.581489841986457, + "grad_norm": 227.02520751953125, + "learning_rate": 5.047186932849365e-06, + "loss": 38.3611, + "step": 4593 + }, + { + "epoch": 16.58510158013544, + "grad_norm": 232.46922302246094, + "learning_rate": 5.041742286751361e-06, + "loss": 36.5043, + "step": 4594 + }, + { + "epoch": 16.588713318284423, + "grad_norm": 230.59536743164062, + "learning_rate": 5.0362976406533575e-06, + "loss": 36.2179, + "step": 4595 + }, + { + "epoch": 16.59232505643341, + "grad_norm": 439.9609069824219, + "learning_rate": 5.0308529945553545e-06, + "loss": 36.4797, + "step": 4596 + }, + { + "epoch": 16.595936794582393, + "grad_norm": 322.4086608886719, + "learning_rate": 5.02540834845735e-06, + "loss": 37.4151, + "step": 4597 + }, + { + "epoch": 16.599548532731376, + "grad_norm": 318.1732482910156, + "learning_rate": 5.019963702359347e-06, + "loss": 37.2815, + "step": 4598 + }, + { + "epoch": 16.603160270880363, + "grad_norm": 321.34039306640625, + "learning_rate": 5.014519056261343e-06, + "loss": 36.8388, + "step": 4599 + }, + { + "epoch": 16.606772009029346, + "grad_norm": 341.28790283203125, + "learning_rate": 5.0090744101633395e-06, + "loss": 37.9805, + "step": 4600 + }, + { + "epoch": 16.606772009029346, + "eval_loss": 0.6045316457748413, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 4600 + }, + { + "epoch": 16.61038374717833, + "grad_norm": 259.9163513183594, + "learning_rate": 5.003629764065336e-06, + "loss": 37.5832, + "step": 4601 + }, + { + "epoch": 16.613995485327315, + "grad_norm": 297.02587890625, + "learning_rate": 4.998185117967332e-06, + "loss": 37.3808, + "step": 4602 + }, + { + "epoch": 16.6176072234763, + "grad_norm": 263.32244873046875, + "learning_rate": 4.992740471869329e-06, + "loss": 37.1047, + "step": 4603 + }, + { + "epoch": 16.62121896162528, + "grad_norm": 262.26104736328125, + "learning_rate": 4.987295825771324e-06, + "loss": 38.3592, + "step": 4604 + }, + { + "epoch": 16.624830699774268, + "grad_norm": 253.7144012451172, + "learning_rate": 4.9818511796733215e-06, + "loss": 37.4098, + "step": 4605 + }, + { + "epoch": 16.62844243792325, + "grad_norm": 279.1004943847656, + "learning_rate": 4.976406533575318e-06, + "loss": 39.3865, + "step": 4606 + }, + { + "epoch": 16.632054176072234, + "grad_norm": 298.7977600097656, + "learning_rate": 4.970961887477315e-06, + "loss": 38.6865, + "step": 4607 + }, + { + "epoch": 16.635665914221217, + "grad_norm": 256.7657470703125, + "learning_rate": 4.96551724137931e-06, + "loss": 38.7068, + "step": 4608 + }, + { + "epoch": 16.639277652370204, + "grad_norm": 238.22979736328125, + "learning_rate": 4.960072595281307e-06, + "loss": 37.749, + "step": 4609 + }, + { + "epoch": 16.642889390519187, + "grad_norm": 248.4231414794922, + "learning_rate": 4.9546279491833035e-06, + "loss": 37.582, + "step": 4610 + }, + { + "epoch": 16.642889390519187, + "eval_loss": 0.6026645302772522, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4610 + }, + { + "epoch": 16.64650112866817, + "grad_norm": 232.70289611816406, + "learning_rate": 4.949183303085299e-06, + "loss": 34.4589, + "step": 4611 + }, + { + "epoch": 16.650112866817157, + "grad_norm": 268.4678955078125, + "learning_rate": 4.943738656987296e-06, + "loss": 32.3619, + "step": 4612 + }, + { + "epoch": 16.65372460496614, + "grad_norm": 272.07794189453125, + "learning_rate": 4.938294010889292e-06, + "loss": 32.3436, + "step": 4613 + }, + { + "epoch": 16.657336343115123, + "grad_norm": 304.4588317871094, + "learning_rate": 4.932849364791289e-06, + "loss": 30.8798, + "step": 4614 + }, + { + "epoch": 16.66094808126411, + "grad_norm": 293.3638000488281, + "learning_rate": 4.927404718693285e-06, + "loss": 31.1892, + "step": 4615 + }, + { + "epoch": 16.664559819413093, + "grad_norm": 292.844482421875, + "learning_rate": 4.921960072595282e-06, + "loss": 31.9604, + "step": 4616 + }, + { + "epoch": 16.668171557562076, + "grad_norm": 246.45339965820312, + "learning_rate": 4.916515426497278e-06, + "loss": 32.242, + "step": 4617 + }, + { + "epoch": 16.671783295711062, + "grad_norm": 269.9577941894531, + "learning_rate": 4.911070780399274e-06, + "loss": 32.5072, + "step": 4618 + }, + { + "epoch": 16.675395033860045, + "grad_norm": 312.8960876464844, + "learning_rate": 4.90562613430127e-06, + "loss": 33.8243, + "step": 4619 + }, + { + "epoch": 16.67900677200903, + "grad_norm": 287.4557189941406, + "learning_rate": 4.900181488203267e-06, + "loss": 34.3557, + "step": 4620 + }, + { + "epoch": 16.67900677200903, + "eval_loss": 0.6047338843345642, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4620 + }, + { + "epoch": 16.682618510158015, + "grad_norm": 403.533935546875, + "learning_rate": 4.894736842105264e-06, + "loss": 34.6895, + "step": 4621 + }, + { + "epoch": 16.686230248306998, + "grad_norm": 387.5083923339844, + "learning_rate": 4.88929219600726e-06, + "loss": 34.2407, + "step": 4622 + }, + { + "epoch": 16.68984198645598, + "grad_norm": 278.8225402832031, + "learning_rate": 4.883847549909256e-06, + "loss": 33.3489, + "step": 4623 + }, + { + "epoch": 16.693453724604964, + "grad_norm": 270.46685791015625, + "learning_rate": 4.878402903811252e-06, + "loss": 34.2095, + "step": 4624 + }, + { + "epoch": 16.69706546275395, + "grad_norm": 244.6392059326172, + "learning_rate": 4.872958257713249e-06, + "loss": 35.783, + "step": 4625 + }, + { + "epoch": 16.700677200902934, + "grad_norm": 327.0617370605469, + "learning_rate": 4.867513611615245e-06, + "loss": 36.4928, + "step": 4626 + }, + { + "epoch": 16.704288939051917, + "grad_norm": 297.0531311035156, + "learning_rate": 4.862068965517241e-06, + "loss": 33.4827, + "step": 4627 + }, + { + "epoch": 16.707900677200904, + "grad_norm": 366.2174377441406, + "learning_rate": 4.856624319419238e-06, + "loss": 26.9456, + "step": 4628 + }, + { + "epoch": 16.711512415349887, + "grad_norm": 436.22613525390625, + "learning_rate": 4.851179673321234e-06, + "loss": 22.2349, + "step": 4629 + }, + { + "epoch": 16.71512415349887, + "grad_norm": 391.7647705078125, + "learning_rate": 4.845735027223231e-06, + "loss": 22.8557, + "step": 4630 + }, + { + "epoch": 16.71512415349887, + "eval_loss": 0.6052708029747009, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 4630 + }, + { + "epoch": 16.718735891647857, + "grad_norm": 277.8678283691406, + "learning_rate": 4.840290381125227e-06, + "loss": 23.3521, + "step": 4631 + }, + { + "epoch": 16.72234762979684, + "grad_norm": 252.46131896972656, + "learning_rate": 4.834845735027224e-06, + "loss": 23.7394, + "step": 4632 + }, + { + "epoch": 16.725959367945823, + "grad_norm": 214.6287078857422, + "learning_rate": 4.82940108892922e-06, + "loss": 38.6633, + "step": 4633 + }, + { + "epoch": 16.72957110609481, + "grad_norm": 257.454345703125, + "learning_rate": 4.8239564428312155e-06, + "loss": 40.5165, + "step": 4634 + }, + { + "epoch": 16.733182844243792, + "grad_norm": 211.1912841796875, + "learning_rate": 4.818511796733213e-06, + "loss": 38.483, + "step": 4635 + }, + { + "epoch": 16.736794582392776, + "grad_norm": 226.8388214111328, + "learning_rate": 4.813067150635209e-06, + "loss": 39.6143, + "step": 4636 + }, + { + "epoch": 16.740406320541762, + "grad_norm": 263.8160400390625, + "learning_rate": 4.807622504537205e-06, + "loss": 37.8442, + "step": 4637 + }, + { + "epoch": 16.744018058690745, + "grad_norm": 284.8119201660156, + "learning_rate": 4.802177858439201e-06, + "loss": 39.1835, + "step": 4638 + }, + { + "epoch": 16.74762979683973, + "grad_norm": 310.31390380859375, + "learning_rate": 4.796733212341198e-06, + "loss": 38.7035, + "step": 4639 + }, + { + "epoch": 16.751241534988715, + "grad_norm": 212.71315002441406, + "learning_rate": 4.791288566243195e-06, + "loss": 38.8803, + "step": 4640 + }, + { + "epoch": 16.751241534988715, + "eval_loss": 0.6030828952789307, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4640 + }, + { + "epoch": 16.754853273137698, + "grad_norm": 209.7708740234375, + "learning_rate": 4.78584392014519e-06, + "loss": 39.0808, + "step": 4641 + }, + { + "epoch": 16.75846501128668, + "grad_norm": 251.971435546875, + "learning_rate": 4.780399274047187e-06, + "loss": 39.2025, + "step": 4642 + }, + { + "epoch": 16.762076749435664, + "grad_norm": 210.54151916503906, + "learning_rate": 4.774954627949183e-06, + "loss": 37.7541, + "step": 4643 + }, + { + "epoch": 16.76568848758465, + "grad_norm": 221.22119140625, + "learning_rate": 4.76950998185118e-06, + "loss": 36.4328, + "step": 4644 + }, + { + "epoch": 16.769300225733634, + "grad_norm": 201.45025634765625, + "learning_rate": 4.764065335753176e-06, + "loss": 34.9771, + "step": 4645 + }, + { + "epoch": 16.772911963882617, + "grad_norm": 241.33030700683594, + "learning_rate": 4.758620689655173e-06, + "loss": 37.6231, + "step": 4646 + }, + { + "epoch": 16.776523702031604, + "grad_norm": 282.12255859375, + "learning_rate": 4.753176043557169e-06, + "loss": 36.9822, + "step": 4647 + }, + { + "epoch": 16.780135440180587, + "grad_norm": 239.93885803222656, + "learning_rate": 4.747731397459165e-06, + "loss": 36.3529, + "step": 4648 + }, + { + "epoch": 16.78374717832957, + "grad_norm": 245.9400634765625, + "learning_rate": 4.7422867513611615e-06, + "loss": 37.518, + "step": 4649 + }, + { + "epoch": 16.787358916478556, + "grad_norm": 280.63720703125, + "learning_rate": 4.736842105263158e-06, + "loss": 37.6323, + "step": 4650 + }, + { + "epoch": 16.787358916478556, + "eval_loss": 0.6054876446723938, + "eval_runtime": 3.1439, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4650 + }, + { + "epoch": 16.79097065462754, + "grad_norm": 368.47698974609375, + "learning_rate": 4.731397459165155e-06, + "loss": 38.1543, + "step": 4651 + }, + { + "epoch": 16.794582392776523, + "grad_norm": 346.9169616699219, + "learning_rate": 4.72595281306715e-06, + "loss": 38.8746, + "step": 4652 + }, + { + "epoch": 16.79819413092551, + "grad_norm": 311.7519836425781, + "learning_rate": 4.720508166969147e-06, + "loss": 37.3475, + "step": 4653 + }, + { + "epoch": 16.801805869074492, + "grad_norm": 323.14910888671875, + "learning_rate": 4.7150635208711435e-06, + "loss": 38.5308, + "step": 4654 + }, + { + "epoch": 16.805417607223475, + "grad_norm": 252.71958923339844, + "learning_rate": 4.70961887477314e-06, + "loss": 38.3275, + "step": 4655 + }, + { + "epoch": 16.809029345372462, + "grad_norm": 364.2929382324219, + "learning_rate": 4.704174228675136e-06, + "loss": 38.9973, + "step": 4656 + }, + { + "epoch": 16.812641083521445, + "grad_norm": 267.23980712890625, + "learning_rate": 4.698729582577132e-06, + "loss": 38.0867, + "step": 4657 + }, + { + "epoch": 16.816252821670428, + "grad_norm": 297.4647521972656, + "learning_rate": 4.693284936479129e-06, + "loss": 38.6933, + "step": 4658 + }, + { + "epoch": 16.819864559819415, + "grad_norm": 276.2767333984375, + "learning_rate": 4.6878402903811255e-06, + "loss": 38.0279, + "step": 4659 + }, + { + "epoch": 16.823476297968398, + "grad_norm": 261.5404052734375, + "learning_rate": 4.682395644283122e-06, + "loss": 36.5149, + "step": 4660 + }, + { + "epoch": 16.823476297968398, + "eval_loss": 0.6019832491874695, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4660 + }, + { + "epoch": 16.82708803611738, + "grad_norm": 313.2170104980469, + "learning_rate": 4.676950998185118e-06, + "loss": 35.6121, + "step": 4661 + }, + { + "epoch": 16.830699774266364, + "grad_norm": 297.2791442871094, + "learning_rate": 4.671506352087115e-06, + "loss": 31.1869, + "step": 4662 + }, + { + "epoch": 16.83431151241535, + "grad_norm": 269.7320556640625, + "learning_rate": 4.666061705989111e-06, + "loss": 31.8674, + "step": 4663 + }, + { + "epoch": 16.837923250564334, + "grad_norm": 245.3898468017578, + "learning_rate": 4.660617059891107e-06, + "loss": 30.3726, + "step": 4664 + }, + { + "epoch": 16.841534988713317, + "grad_norm": 244.63223266601562, + "learning_rate": 4.655172413793104e-06, + "loss": 32.6154, + "step": 4665 + }, + { + "epoch": 16.845146726862303, + "grad_norm": 263.6791076660156, + "learning_rate": 4.6497277676951e-06, + "loss": 33.0104, + "step": 4666 + }, + { + "epoch": 16.848758465011286, + "grad_norm": 398.6610107421875, + "learning_rate": 4.644283121597096e-06, + "loss": 32.5445, + "step": 4667 + }, + { + "epoch": 16.85237020316027, + "grad_norm": 312.8116149902344, + "learning_rate": 4.6388384754990924e-06, + "loss": 32.5698, + "step": 4668 + }, + { + "epoch": 16.855981941309256, + "grad_norm": 296.6167297363281, + "learning_rate": 4.6333938294010895e-06, + "loss": 33.1377, + "step": 4669 + }, + { + "epoch": 16.85959367945824, + "grad_norm": 285.299560546875, + "learning_rate": 4.627949183303086e-06, + "loss": 33.3279, + "step": 4670 + }, + { + "epoch": 16.85959367945824, + "eval_loss": 0.6027817726135254, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.985, + "eval_steps_per_second": 56.985, + "step": 4670 + }, + { + "epoch": 16.863205417607222, + "grad_norm": 285.2948913574219, + "learning_rate": 4.622504537205081e-06, + "loss": 35.6879, + "step": 4671 + }, + { + "epoch": 16.86681715575621, + "grad_norm": 280.6530456542969, + "learning_rate": 4.617059891107078e-06, + "loss": 32.3154, + "step": 4672 + }, + { + "epoch": 16.870428893905192, + "grad_norm": 314.206787109375, + "learning_rate": 4.6116152450090744e-06, + "loss": 34.3517, + "step": 4673 + }, + { + "epoch": 16.874040632054175, + "grad_norm": 305.9198913574219, + "learning_rate": 4.6061705989110715e-06, + "loss": 34.1571, + "step": 4674 + }, + { + "epoch": 16.877652370203162, + "grad_norm": 287.0543212890625, + "learning_rate": 4.600725952813067e-06, + "loss": 35.1647, + "step": 4675 + }, + { + "epoch": 16.881264108352145, + "grad_norm": 286.912109375, + "learning_rate": 4.595281306715064e-06, + "loss": 34.8698, + "step": 4676 + }, + { + "epoch": 16.884875846501128, + "grad_norm": 322.4527587890625, + "learning_rate": 4.58983666061706e-06, + "loss": 36.3449, + "step": 4677 + }, + { + "epoch": 16.888487584650115, + "grad_norm": 239.41659545898438, + "learning_rate": 4.584392014519056e-06, + "loss": 25.3085, + "step": 4678 + }, + { + "epoch": 16.892099322799098, + "grad_norm": 215.5685577392578, + "learning_rate": 4.578947368421053e-06, + "loss": 22.3485, + "step": 4679 + }, + { + "epoch": 16.89571106094808, + "grad_norm": 291.2452697753906, + "learning_rate": 4.573502722323049e-06, + "loss": 22.3257, + "step": 4680 + }, + { + "epoch": 16.89571106094808, + "eval_loss": 0.6040940284729004, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 4680 + }, + { + "epoch": 16.899322799097064, + "grad_norm": 291.39935302734375, + "learning_rate": 4.568058076225046e-06, + "loss": 23.268, + "step": 4681 + }, + { + "epoch": 16.90293453724605, + "grad_norm": 272.211181640625, + "learning_rate": 4.562613430127041e-06, + "loss": 23.7127, + "step": 4682 + }, + { + "epoch": 16.906546275395034, + "grad_norm": 220.84397888183594, + "learning_rate": 4.5571687840290384e-06, + "loss": 39.2488, + "step": 4683 + }, + { + "epoch": 16.910158013544017, + "grad_norm": 238.49859619140625, + "learning_rate": 4.551724137931035e-06, + "loss": 39.5643, + "step": 4684 + }, + { + "epoch": 16.913769751693003, + "grad_norm": 325.3870544433594, + "learning_rate": 4.546279491833032e-06, + "loss": 38.6149, + "step": 4685 + }, + { + "epoch": 16.917381489841986, + "grad_norm": 307.02349853515625, + "learning_rate": 4.540834845735027e-06, + "loss": 38.0317, + "step": 4686 + }, + { + "epoch": 16.92099322799097, + "grad_norm": 433.99359130859375, + "learning_rate": 4.535390199637023e-06, + "loss": 40.4567, + "step": 4687 + }, + { + "epoch": 16.924604966139956, + "grad_norm": 327.97015380859375, + "learning_rate": 4.5299455535390204e-06, + "loss": 40.3109, + "step": 4688 + }, + { + "epoch": 16.92821670428894, + "grad_norm": 257.20684814453125, + "learning_rate": 4.524500907441017e-06, + "loss": 36.2826, + "step": 4689 + }, + { + "epoch": 16.931828442437922, + "grad_norm": 402.6732177734375, + "learning_rate": 4.519056261343013e-06, + "loss": 36.9163, + "step": 4690 + }, + { + "epoch": 16.931828442437922, + "eval_loss": 0.6016727089881897, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4690 + }, + { + "epoch": 16.93544018058691, + "grad_norm": 380.8903503417969, + "learning_rate": 4.513611615245009e-06, + "loss": 36.7101, + "step": 4691 + }, + { + "epoch": 16.939051918735892, + "grad_norm": 365.4950256347656, + "learning_rate": 4.508166969147006e-06, + "loss": 37.9853, + "step": 4692 + }, + { + "epoch": 16.942663656884875, + "grad_norm": 302.3895568847656, + "learning_rate": 4.5027223230490016e-06, + "loss": 38.109, + "step": 4693 + }, + { + "epoch": 16.94627539503386, + "grad_norm": 333.5274963378906, + "learning_rate": 4.497277676950998e-06, + "loss": 37.5992, + "step": 4694 + }, + { + "epoch": 16.949887133182845, + "grad_norm": 364.3126525878906, + "learning_rate": 4.491833030852995e-06, + "loss": 38.0139, + "step": 4695 + }, + { + "epoch": 16.953498871331828, + "grad_norm": 509.94671630859375, + "learning_rate": 4.486388384754991e-06, + "loss": 39.8027, + "step": 4696 + }, + { + "epoch": 16.957110609480814, + "grad_norm": 507.8591613769531, + "learning_rate": 4.480943738656987e-06, + "loss": 40.0044, + "step": 4697 + }, + { + "epoch": 16.960722347629797, + "grad_norm": 324.5463562011719, + "learning_rate": 4.4754990925589836e-06, + "loss": 34.9058, + "step": 4698 + }, + { + "epoch": 16.96433408577878, + "grad_norm": 318.39801025390625, + "learning_rate": 4.470054446460981e-06, + "loss": 33.1318, + "step": 4699 + }, + { + "epoch": 16.967945823927764, + "grad_norm": 391.8466796875, + "learning_rate": 4.464609800362977e-06, + "loss": 32.2083, + "step": 4700 + }, + { + "epoch": 16.967945823927764, + "eval_loss": 0.6047930717468262, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4700 + }, + { + "epoch": 16.97155756207675, + "grad_norm": 530.4073486328125, + "learning_rate": 4.459165154264972e-06, + "loss": 31.9882, + "step": 4701 + }, + { + "epoch": 16.975169300225733, + "grad_norm": 590.9242553710938, + "learning_rate": 4.453720508166969e-06, + "loss": 34.1937, + "step": 4702 + }, + { + "epoch": 16.978781038374716, + "grad_norm": 377.5596618652344, + "learning_rate": 4.4482758620689656e-06, + "loss": 34.6501, + "step": 4703 + }, + { + "epoch": 16.982392776523703, + "grad_norm": 431.2909240722656, + "learning_rate": 4.442831215970962e-06, + "loss": 33.9402, + "step": 4704 + }, + { + "epoch": 16.986004514672686, + "grad_norm": 294.7673645019531, + "learning_rate": 4.437386569872958e-06, + "loss": 33.7873, + "step": 4705 + }, + { + "epoch": 16.98961625282167, + "grad_norm": 346.1203918457031, + "learning_rate": 4.431941923774955e-06, + "loss": 35.2935, + "step": 4706 + }, + { + "epoch": 16.993227990970656, + "grad_norm": 257.8351745605469, + "learning_rate": 4.426497277676951e-06, + "loss": 28.3513, + "step": 4707 + }, + { + "epoch": 16.99683972911964, + "grad_norm": 168.35118103027344, + "learning_rate": 4.421052631578947e-06, + "loss": 22.3009, + "step": 4708 + }, + { + "epoch": 17.0, + "grad_norm": 210.20738220214844, + "learning_rate": 4.415607985480944e-06, + "loss": 20.1848, + "step": 4709 + }, + { + "epoch": 17.003611738148983, + "grad_norm": 234.40866088867188, + "learning_rate": 4.41016333938294e-06, + "loss": 38.0969, + "step": 4710 + }, + { + "epoch": 17.003611738148983, + "eval_loss": 0.6026900410652161, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4710 + }, + { + "epoch": 17.00722347629797, + "grad_norm": 242.27195739746094, + "learning_rate": 4.404718693284937e-06, + "loss": 38.8902, + "step": 4711 + }, + { + "epoch": 17.010835214446953, + "grad_norm": 215.1695556640625, + "learning_rate": 4.3992740471869325e-06, + "loss": 38.5509, + "step": 4712 + }, + { + "epoch": 17.014446952595936, + "grad_norm": 390.2027587890625, + "learning_rate": 4.3938294010889296e-06, + "loss": 38.5247, + "step": 4713 + }, + { + "epoch": 17.018058690744923, + "grad_norm": 397.77484130859375, + "learning_rate": 4.388384754990926e-06, + "loss": 39.1981, + "step": 4714 + }, + { + "epoch": 17.021670428893906, + "grad_norm": 298.10089111328125, + "learning_rate": 4.382940108892923e-06, + "loss": 38.2627, + "step": 4715 + }, + { + "epoch": 17.02528216704289, + "grad_norm": 291.7283935546875, + "learning_rate": 4.377495462794918e-06, + "loss": 38.8027, + "step": 4716 + }, + { + "epoch": 17.028893905191875, + "grad_norm": 254.8542938232422, + "learning_rate": 4.3720508166969145e-06, + "loss": 38.6095, + "step": 4717 + }, + { + "epoch": 17.03250564334086, + "grad_norm": 244.336181640625, + "learning_rate": 4.3666061705989116e-06, + "loss": 38.2955, + "step": 4718 + }, + { + "epoch": 17.03611738148984, + "grad_norm": 376.92523193359375, + "learning_rate": 4.361161524500907e-06, + "loss": 38.5203, + "step": 4719 + }, + { + "epoch": 17.039729119638825, + "grad_norm": 339.6172790527344, + "learning_rate": 4.355716878402904e-06, + "loss": 37.4332, + "step": 4720 + }, + { + "epoch": 17.039729119638825, + "eval_loss": 0.6024167537689209, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4720 + }, + { + "epoch": 17.04334085778781, + "grad_norm": 433.0855712890625, + "learning_rate": 4.3502722323049e-06, + "loss": 36.4444, + "step": 4721 + }, + { + "epoch": 17.046952595936794, + "grad_norm": 224.3468475341797, + "learning_rate": 4.344827586206897e-06, + "loss": 35.7802, + "step": 4722 + }, + { + "epoch": 17.050564334085777, + "grad_norm": 385.5466003417969, + "learning_rate": 4.339382940108893e-06, + "loss": 35.4641, + "step": 4723 + }, + { + "epoch": 17.054176072234764, + "grad_norm": 311.80596923828125, + "learning_rate": 4.333938294010889e-06, + "loss": 36.4231, + "step": 4724 + }, + { + "epoch": 17.057787810383747, + "grad_norm": 283.189453125, + "learning_rate": 4.328493647912886e-06, + "loss": 37.5405, + "step": 4725 + }, + { + "epoch": 17.06139954853273, + "grad_norm": 403.85833740234375, + "learning_rate": 4.323049001814882e-06, + "loss": 37.4723, + "step": 4726 + }, + { + "epoch": 17.065011286681717, + "grad_norm": 390.03515625, + "learning_rate": 4.3176043557168785e-06, + "loss": 36.6799, + "step": 4727 + }, + { + "epoch": 17.0686230248307, + "grad_norm": 318.63427734375, + "learning_rate": 4.312159709618875e-06, + "loss": 36.6312, + "step": 4728 + }, + { + "epoch": 17.072234762979683, + "grad_norm": 318.43402099609375, + "learning_rate": 4.306715063520872e-06, + "loss": 37.9104, + "step": 4729 + }, + { + "epoch": 17.07584650112867, + "grad_norm": 320.9336853027344, + "learning_rate": 4.301270417422867e-06, + "loss": 36.7254, + "step": 4730 + }, + { + "epoch": 17.07584650112867, + "eval_loss": 0.6046721339225769, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4730 + }, + { + "epoch": 17.079458239277653, + "grad_norm": 345.9001770019531, + "learning_rate": 4.295825771324863e-06, + "loss": 36.0298, + "step": 4731 + }, + { + "epoch": 17.083069977426636, + "grad_norm": 397.10369873046875, + "learning_rate": 4.2903811252268605e-06, + "loss": 37.9418, + "step": 4732 + }, + { + "epoch": 17.086681715575622, + "grad_norm": 293.1039123535156, + "learning_rate": 4.284936479128857e-06, + "loss": 37.2627, + "step": 4733 + }, + { + "epoch": 17.090293453724605, + "grad_norm": 412.5190734863281, + "learning_rate": 4.279491833030853e-06, + "loss": 38.3429, + "step": 4734 + }, + { + "epoch": 17.09390519187359, + "grad_norm": 241.35105895996094, + "learning_rate": 4.274047186932849e-06, + "loss": 38.559, + "step": 4735 + }, + { + "epoch": 17.097516930022575, + "grad_norm": 275.169189453125, + "learning_rate": 4.268602540834846e-06, + "loss": 36.8167, + "step": 4736 + }, + { + "epoch": 17.101128668171558, + "grad_norm": 272.3182678222656, + "learning_rate": 4.2631578947368425e-06, + "loss": 37.0246, + "step": 4737 + }, + { + "epoch": 17.10474040632054, + "grad_norm": 215.6425018310547, + "learning_rate": 4.257713248638839e-06, + "loss": 33.1282, + "step": 4738 + }, + { + "epoch": 17.108352144469524, + "grad_norm": 276.6223449707031, + "learning_rate": 4.252268602540835e-06, + "loss": 33.2698, + "step": 4739 + }, + { + "epoch": 17.11196388261851, + "grad_norm": 311.1632385253906, + "learning_rate": 4.246823956442831e-06, + "loss": 31.0105, + "step": 4740 + }, + { + "epoch": 17.11196388261851, + "eval_loss": 0.6019421815872192, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.089, + "eval_steps_per_second": 57.089, + "step": 4740 + }, + { + "epoch": 17.115575620767494, + "grad_norm": 254.7543487548828, + "learning_rate": 4.241379310344828e-06, + "loss": 31.4721, + "step": 4741 + }, + { + "epoch": 17.119187358916477, + "grad_norm": 239.24957275390625, + "learning_rate": 4.235934664246824e-06, + "loss": 31.0346, + "step": 4742 + }, + { + "epoch": 17.122799097065464, + "grad_norm": 262.0681457519531, + "learning_rate": 4.230490018148821e-06, + "loss": 32.0604, + "step": 4743 + }, + { + "epoch": 17.126410835214447, + "grad_norm": 218.3557586669922, + "learning_rate": 4.225045372050817e-06, + "loss": 32.2036, + "step": 4744 + }, + { + "epoch": 17.13002257336343, + "grad_norm": 277.5924072265625, + "learning_rate": 4.219600725952813e-06, + "loss": 32.1412, + "step": 4745 + }, + { + "epoch": 17.133634311512417, + "grad_norm": 226.93211364746094, + "learning_rate": 4.214156079854809e-06, + "loss": 34.3367, + "step": 4746 + }, + { + "epoch": 17.1372460496614, + "grad_norm": 303.2422180175781, + "learning_rate": 4.208711433756806e-06, + "loss": 33.2001, + "step": 4747 + }, + { + "epoch": 17.140857787810383, + "grad_norm": 257.6164245605469, + "learning_rate": 4.203266787658803e-06, + "loss": 34.155, + "step": 4748 + }, + { + "epoch": 17.14446952595937, + "grad_norm": 361.1567077636719, + "learning_rate": 4.197822141560798e-06, + "loss": 35.236, + "step": 4749 + }, + { + "epoch": 17.148081264108352, + "grad_norm": 292.0034484863281, + "learning_rate": 4.192377495462795e-06, + "loss": 34.304, + "step": 4750 + }, + { + "epoch": 17.148081264108352, + "eval_loss": 0.6034401059150696, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4750 + }, + { + "epoch": 17.151693002257336, + "grad_norm": 327.8070983886719, + "learning_rate": 4.186932849364791e-06, + "loss": 33.7346, + "step": 4751 + }, + { + "epoch": 17.155304740406322, + "grad_norm": 312.9547119140625, + "learning_rate": 4.1814882032667885e-06, + "loss": 35.9274, + "step": 4752 + }, + { + "epoch": 17.158916478555305, + "grad_norm": 305.19500732421875, + "learning_rate": 4.176043557168784e-06, + "loss": 35.5567, + "step": 4753 + }, + { + "epoch": 17.16252821670429, + "grad_norm": 339.37152099609375, + "learning_rate": 4.17059891107078e-06, + "loss": 35.8013, + "step": 4754 + }, + { + "epoch": 17.16613995485327, + "grad_norm": 247.36679077148438, + "learning_rate": 4.165154264972777e-06, + "loss": 29.2211, + "step": 4755 + }, + { + "epoch": 17.169751693002258, + "grad_norm": 255.65269470214844, + "learning_rate": 4.1597096188747725e-06, + "loss": 21.6191, + "step": 4756 + }, + { + "epoch": 17.17336343115124, + "grad_norm": 239.66448974609375, + "learning_rate": 4.15426497277677e-06, + "loss": 22.0521, + "step": 4757 + }, + { + "epoch": 17.176975169300224, + "grad_norm": 212.25955200195312, + "learning_rate": 4.148820326678766e-06, + "loss": 22.6641, + "step": 4758 + }, + { + "epoch": 17.18058690744921, + "grad_norm": 229.9394073486328, + "learning_rate": 4.143375680580763e-06, + "loss": 22.8787, + "step": 4759 + }, + { + "epoch": 17.184198645598194, + "grad_norm": 237.46343994140625, + "learning_rate": 4.137931034482758e-06, + "loss": 39.1222, + "step": 4760 + }, + { + "epoch": 17.184198645598194, + "eval_loss": 0.6031526327133179, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4760 + }, + { + "epoch": 17.187810383747177, + "grad_norm": 229.23849487304688, + "learning_rate": 4.132486388384755e-06, + "loss": 39.7664, + "step": 4761 + }, + { + "epoch": 17.191422121896164, + "grad_norm": 250.67529296875, + "learning_rate": 4.127041742286752e-06, + "loss": 38.6754, + "step": 4762 + }, + { + "epoch": 17.195033860045147, + "grad_norm": 272.9320068359375, + "learning_rate": 4.121597096188748e-06, + "loss": 39.1262, + "step": 4763 + }, + { + "epoch": 17.19864559819413, + "grad_norm": 267.82427978515625, + "learning_rate": 4.116152450090744e-06, + "loss": 38.2223, + "step": 4764 + }, + { + "epoch": 17.202257336343116, + "grad_norm": 266.35760498046875, + "learning_rate": 4.11070780399274e-06, + "loss": 39.2069, + "step": 4765 + }, + { + "epoch": 17.2058690744921, + "grad_norm": 221.62606811523438, + "learning_rate": 4.105263157894737e-06, + "loss": 38.8956, + "step": 4766 + }, + { + "epoch": 17.209480812641083, + "grad_norm": 243.73110961914062, + "learning_rate": 4.099818511796734e-06, + "loss": 41.5868, + "step": 4767 + }, + { + "epoch": 17.21309255079007, + "grad_norm": 268.6092224121094, + "learning_rate": 4.09437386569873e-06, + "loss": 39.1041, + "step": 4768 + }, + { + "epoch": 17.216704288939052, + "grad_norm": 300.3140563964844, + "learning_rate": 4.088929219600726e-06, + "loss": 38.25, + "step": 4769 + }, + { + "epoch": 17.220316027088035, + "grad_norm": 264.56805419921875, + "learning_rate": 4.083484573502722e-06, + "loss": 38.186, + "step": 4770 + }, + { + "epoch": 17.220316027088035, + "eval_loss": 0.6044566631317139, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4770 + }, + { + "epoch": 17.223927765237022, + "grad_norm": 303.47686767578125, + "learning_rate": 4.0780399274047185e-06, + "loss": 37.7011, + "step": 4771 + }, + { + "epoch": 17.227539503386005, + "grad_norm": 238.3590545654297, + "learning_rate": 4.072595281306715e-06, + "loss": 34.6695, + "step": 4772 + }, + { + "epoch": 17.231151241534988, + "grad_norm": 252.90081787109375, + "learning_rate": 4.067150635208712e-06, + "loss": 36.1903, + "step": 4773 + }, + { + "epoch": 17.23476297968397, + "grad_norm": 286.5584716796875, + "learning_rate": 4.061705989110708e-06, + "loss": 36.4185, + "step": 4774 + }, + { + "epoch": 17.238374717832958, + "grad_norm": 322.25323486328125, + "learning_rate": 4.056261343012704e-06, + "loss": 36.0098, + "step": 4775 + }, + { + "epoch": 17.24198645598194, + "grad_norm": 292.09405517578125, + "learning_rate": 4.0508166969147005e-06, + "loss": 35.4347, + "step": 4776 + }, + { + "epoch": 17.245598194130924, + "grad_norm": 295.9725341796875, + "learning_rate": 4.045372050816697e-06, + "loss": 37.3512, + "step": 4777 + }, + { + "epoch": 17.24920993227991, + "grad_norm": 326.34539794921875, + "learning_rate": 4.039927404718694e-06, + "loss": 38.6739, + "step": 4778 + }, + { + "epoch": 17.252821670428894, + "grad_norm": 384.3682861328125, + "learning_rate": 4.034482758620689e-06, + "loss": 38.0995, + "step": 4779 + }, + { + "epoch": 17.256433408577877, + "grad_norm": 400.59136962890625, + "learning_rate": 4.029038112522686e-06, + "loss": 36.7733, + "step": 4780 + }, + { + "epoch": 17.256433408577877, + "eval_loss": 0.6064656972885132, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4780 + }, + { + "epoch": 17.260045146726863, + "grad_norm": 379.5261535644531, + "learning_rate": 4.0235934664246825e-06, + "loss": 36.1385, + "step": 4781 + }, + { + "epoch": 17.263656884875846, + "grad_norm": 277.1004638671875, + "learning_rate": 4.018148820326679e-06, + "loss": 39.1495, + "step": 4782 + }, + { + "epoch": 17.26726862302483, + "grad_norm": 274.6176452636719, + "learning_rate": 4.012704174228675e-06, + "loss": 37.8503, + "step": 4783 + }, + { + "epoch": 17.270880361173816, + "grad_norm": 338.9375305175781, + "learning_rate": 4.007259528130671e-06, + "loss": 39.7149, + "step": 4784 + }, + { + "epoch": 17.2744920993228, + "grad_norm": 299.60662841796875, + "learning_rate": 4.001814882032668e-06, + "loss": 37.6013, + "step": 4785 + }, + { + "epoch": 17.278103837471782, + "grad_norm": 278.9190368652344, + "learning_rate": 3.996370235934664e-06, + "loss": 38.1106, + "step": 4786 + }, + { + "epoch": 17.28171557562077, + "grad_norm": 254.48443603515625, + "learning_rate": 3.990925589836661e-06, + "loss": 35.9676, + "step": 4787 + }, + { + "epoch": 17.285327313769752, + "grad_norm": 274.65338134765625, + "learning_rate": 3.985480943738657e-06, + "loss": 35.3535, + "step": 4788 + }, + { + "epoch": 17.288939051918735, + "grad_norm": 288.748779296875, + "learning_rate": 3.980036297640654e-06, + "loss": 32.7356, + "step": 4789 + }, + { + "epoch": 17.292550790067722, + "grad_norm": 229.0682830810547, + "learning_rate": 3.9745916515426495e-06, + "loss": 31.2048, + "step": 4790 + }, + { + "epoch": 17.292550790067722, + "eval_loss": 0.6020387411117554, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 4790 + }, + { + "epoch": 17.296162528216705, + "grad_norm": 234.29937744140625, + "learning_rate": 3.9691470054446465e-06, + "loss": 31.7953, + "step": 4791 + }, + { + "epoch": 17.299774266365688, + "grad_norm": 236.3527069091797, + "learning_rate": 3.963702359346643e-06, + "loss": 31.6686, + "step": 4792 + }, + { + "epoch": 17.30338600451467, + "grad_norm": 253.44126892089844, + "learning_rate": 3.958257713248639e-06, + "loss": 31.8848, + "step": 4793 + }, + { + "epoch": 17.306997742663658, + "grad_norm": 270.66046142578125, + "learning_rate": 3.952813067150635e-06, + "loss": 32.1593, + "step": 4794 + }, + { + "epoch": 17.31060948081264, + "grad_norm": 242.77777099609375, + "learning_rate": 3.9473684210526315e-06, + "loss": 32.4555, + "step": 4795 + }, + { + "epoch": 17.314221218961624, + "grad_norm": 243.9296112060547, + "learning_rate": 3.9419237749546285e-06, + "loss": 34.0444, + "step": 4796 + }, + { + "epoch": 17.31783295711061, + "grad_norm": 276.2138671875, + "learning_rate": 3.936479128856624e-06, + "loss": 32.0404, + "step": 4797 + }, + { + "epoch": 17.321444695259594, + "grad_norm": 262.97802734375, + "learning_rate": 3.931034482758621e-06, + "loss": 32.4535, + "step": 4798 + }, + { + "epoch": 17.325056433408577, + "grad_norm": 338.9852600097656, + "learning_rate": 3.925589836660617e-06, + "loss": 34.6855, + "step": 4799 + }, + { + "epoch": 17.328668171557563, + "grad_norm": 270.85650634765625, + "learning_rate": 3.9201451905626135e-06, + "loss": 32.2425, + "step": 4800 + }, + { + "epoch": 17.328668171557563, + "eval_loss": 0.603055477142334, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4800 + }, + { + "epoch": 17.332279909706546, + "grad_norm": 289.17584228515625, + "learning_rate": 3.91470054446461e-06, + "loss": 34.6461, + "step": 4801 + }, + { + "epoch": 17.33589164785553, + "grad_norm": 301.120361328125, + "learning_rate": 3.909255898366606e-06, + "loss": 34.5622, + "step": 4802 + }, + { + "epoch": 17.339503386004516, + "grad_norm": 328.93524169921875, + "learning_rate": 3.903811252268603e-06, + "loss": 34.9585, + "step": 4803 + }, + { + "epoch": 17.3431151241535, + "grad_norm": 445.72003173828125, + "learning_rate": 3.898366606170599e-06, + "loss": 36.9729, + "step": 4804 + }, + { + "epoch": 17.346726862302482, + "grad_norm": 249.7901153564453, + "learning_rate": 3.8929219600725955e-06, + "loss": 30.1609, + "step": 4805 + }, + { + "epoch": 17.35033860045147, + "grad_norm": 230.1756134033203, + "learning_rate": 3.887477313974592e-06, + "loss": 21.6742, + "step": 4806 + }, + { + "epoch": 17.353950338600452, + "grad_norm": 193.68104553222656, + "learning_rate": 3.882032667876588e-06, + "loss": 22.0064, + "step": 4807 + }, + { + "epoch": 17.357562076749435, + "grad_norm": 232.58486938476562, + "learning_rate": 3.876588021778585e-06, + "loss": 23.1576, + "step": 4808 + }, + { + "epoch": 17.36117381489842, + "grad_norm": 256.0340270996094, + "learning_rate": 3.87114337568058e-06, + "loss": 23.5346, + "step": 4809 + }, + { + "epoch": 17.364785553047405, + "grad_norm": 260.8665771484375, + "learning_rate": 3.8656987295825775e-06, + "loss": 39.5267, + "step": 4810 + }, + { + "epoch": 17.364785553047405, + "eval_loss": 0.6040924191474915, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 4810 + }, + { + "epoch": 17.368397291196388, + "grad_norm": 253.2076873779297, + "learning_rate": 3.860254083484574e-06, + "loss": 40.222, + "step": 4811 + }, + { + "epoch": 17.37200902934537, + "grad_norm": 232.68162536621094, + "learning_rate": 3.85480943738657e-06, + "loss": 38.8405, + "step": 4812 + }, + { + "epoch": 17.375620767494357, + "grad_norm": 264.7735290527344, + "learning_rate": 3.849364791288566e-06, + "loss": 37.8169, + "step": 4813 + }, + { + "epoch": 17.37923250564334, + "grad_norm": 305.1289978027344, + "learning_rate": 3.843920145190563e-06, + "loss": 39.4413, + "step": 4814 + }, + { + "epoch": 17.382844243792324, + "grad_norm": 409.03106689453125, + "learning_rate": 3.8384754990925594e-06, + "loss": 40.146, + "step": 4815 + }, + { + "epoch": 17.38645598194131, + "grad_norm": 307.2272644042969, + "learning_rate": 3.833030852994555e-06, + "loss": 39.0141, + "step": 4816 + }, + { + "epoch": 17.390067720090293, + "grad_norm": 272.6708068847656, + "learning_rate": 3.827586206896552e-06, + "loss": 39.4356, + "step": 4817 + }, + { + "epoch": 17.393679458239276, + "grad_norm": 239.75225830078125, + "learning_rate": 3.822141560798548e-06, + "loss": 39.1581, + "step": 4818 + }, + { + "epoch": 17.397291196388263, + "grad_norm": 203.42205810546875, + "learning_rate": 3.816696914700545e-06, + "loss": 39.9827, + "step": 4819 + }, + { + "epoch": 17.400902934537246, + "grad_norm": 217.77159118652344, + "learning_rate": 3.811252268602541e-06, + "loss": 37.5404, + "step": 4820 + }, + { + "epoch": 17.400902934537246, + "eval_loss": 0.6033807396888733, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4820 + }, + { + "epoch": 17.40451467268623, + "grad_norm": 257.9713134765625, + "learning_rate": 3.8058076225045377e-06, + "loss": 35.6571, + "step": 4821 + }, + { + "epoch": 17.408126410835216, + "grad_norm": 295.11468505859375, + "learning_rate": 3.8003629764065335e-06, + "loss": 34.7256, + "step": 4822 + }, + { + "epoch": 17.4117381489842, + "grad_norm": 248.15908813476562, + "learning_rate": 3.7949183303085297e-06, + "loss": 37.3417, + "step": 4823 + }, + { + "epoch": 17.415349887133182, + "grad_norm": 295.19085693359375, + "learning_rate": 3.7894736842105264e-06, + "loss": 37.0117, + "step": 4824 + }, + { + "epoch": 17.41896162528217, + "grad_norm": 249.31576538085938, + "learning_rate": 3.7840290381125226e-06, + "loss": 37.168, + "step": 4825 + }, + { + "epoch": 17.42257336343115, + "grad_norm": 271.1731262207031, + "learning_rate": 3.7785843920145193e-06, + "loss": 35.9932, + "step": 4826 + }, + { + "epoch": 17.426185101580135, + "grad_norm": 380.6817626953125, + "learning_rate": 3.7731397459165155e-06, + "loss": 36.952, + "step": 4827 + }, + { + "epoch": 17.42979683972912, + "grad_norm": 370.125244140625, + "learning_rate": 3.767695099818512e-06, + "loss": 38.2224, + "step": 4828 + }, + { + "epoch": 17.433408577878104, + "grad_norm": 291.13568115234375, + "learning_rate": 3.7622504537205084e-06, + "loss": 38.5377, + "step": 4829 + }, + { + "epoch": 17.437020316027088, + "grad_norm": 329.5670471191406, + "learning_rate": 3.756805807622504e-06, + "loss": 38.1665, + "step": 4830 + }, + { + "epoch": 17.437020316027088, + "eval_loss": 0.6047329902648926, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4830 + }, + { + "epoch": 17.44063205417607, + "grad_norm": 266.0620422363281, + "learning_rate": 3.7513611615245012e-06, + "loss": 34.8371, + "step": 4831 + }, + { + "epoch": 17.444243792325057, + "grad_norm": 257.48980712890625, + "learning_rate": 3.7459165154264975e-06, + "loss": 37.1885, + "step": 4832 + }, + { + "epoch": 17.44785553047404, + "grad_norm": 346.8575439453125, + "learning_rate": 3.740471869328494e-06, + "loss": 38.1426, + "step": 4833 + }, + { + "epoch": 17.451467268623023, + "grad_norm": 246.66868591308594, + "learning_rate": 3.73502722323049e-06, + "loss": 37.6658, + "step": 4834 + }, + { + "epoch": 17.45507900677201, + "grad_norm": 309.71087646484375, + "learning_rate": 3.729582577132486e-06, + "loss": 38.2335, + "step": 4835 + }, + { + "epoch": 17.458690744920993, + "grad_norm": 304.1862487792969, + "learning_rate": 3.724137931034483e-06, + "loss": 38.5964, + "step": 4836 + }, + { + "epoch": 17.462302483069976, + "grad_norm": 253.73211669921875, + "learning_rate": 3.718693284936479e-06, + "loss": 38.9237, + "step": 4837 + }, + { + "epoch": 17.465914221218963, + "grad_norm": 208.52822875976562, + "learning_rate": 3.7132486388384757e-06, + "loss": 35.9177, + "step": 4838 + }, + { + "epoch": 17.469525959367946, + "grad_norm": 258.5502014160156, + "learning_rate": 3.707803992740472e-06, + "loss": 33.2577, + "step": 4839 + }, + { + "epoch": 17.47313769751693, + "grad_norm": 269.1754150390625, + "learning_rate": 3.7023593466424686e-06, + "loss": 31.2634, + "step": 4840 + }, + { + "epoch": 17.47313769751693, + "eval_loss": 0.6035012006759644, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4840 + }, + { + "epoch": 17.476749435665916, + "grad_norm": 268.5780029296875, + "learning_rate": 3.6969147005444644e-06, + "loss": 30.6732, + "step": 4841 + }, + { + "epoch": 17.4803611738149, + "grad_norm": 223.7191619873047, + "learning_rate": 3.691470054446461e-06, + "loss": 31.5905, + "step": 4842 + }, + { + "epoch": 17.483972911963882, + "grad_norm": 266.960205078125, + "learning_rate": 3.6860254083484573e-06, + "loss": 31.9407, + "step": 4843 + }, + { + "epoch": 17.48758465011287, + "grad_norm": 241.2608184814453, + "learning_rate": 3.680580762250454e-06, + "loss": 31.8078, + "step": 4844 + }, + { + "epoch": 17.49119638826185, + "grad_norm": 315.95166015625, + "learning_rate": 3.67513611615245e-06, + "loss": 33.5336, + "step": 4845 + }, + { + "epoch": 17.494808126410835, + "grad_norm": 277.731689453125, + "learning_rate": 3.669691470054447e-06, + "loss": 33.0484, + "step": 4846 + }, + { + "epoch": 17.498419864559818, + "grad_norm": 272.35137939453125, + "learning_rate": 3.664246823956443e-06, + "loss": 33.5048, + "step": 4847 + }, + { + "epoch": 17.502031602708804, + "grad_norm": 260.4573974609375, + "learning_rate": 3.6588021778584393e-06, + "loss": 33.5782, + "step": 4848 + }, + { + "epoch": 17.505643340857787, + "grad_norm": 285.7935485839844, + "learning_rate": 3.6533575317604355e-06, + "loss": 35.0308, + "step": 4849 + }, + { + "epoch": 17.50925507900677, + "grad_norm": 267.613037109375, + "learning_rate": 3.6479128856624317e-06, + "loss": 34.8067, + "step": 4850 + }, + { + "epoch": 17.50925507900677, + "eval_loss": 0.6035751700401306, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4850 + }, + { + "epoch": 17.512866817155757, + "grad_norm": 301.43536376953125, + "learning_rate": 3.6424682395644284e-06, + "loss": 33.1631, + "step": 4851 + }, + { + "epoch": 17.51647855530474, + "grad_norm": 270.10467529296875, + "learning_rate": 3.6370235934664246e-06, + "loss": 32.978, + "step": 4852 + }, + { + "epoch": 17.520090293453723, + "grad_norm": 280.802001953125, + "learning_rate": 3.6315789473684213e-06, + "loss": 35.3346, + "step": 4853 + }, + { + "epoch": 17.52370203160271, + "grad_norm": 314.7720031738281, + "learning_rate": 3.6261343012704175e-06, + "loss": 33.4881, + "step": 4854 + }, + { + "epoch": 17.527313769751693, + "grad_norm": 347.4674072265625, + "learning_rate": 3.620689655172414e-06, + "loss": 31.5599, + "step": 4855 + }, + { + "epoch": 17.530925507900676, + "grad_norm": 207.3061981201172, + "learning_rate": 3.61524500907441e-06, + "loss": 22.159, + "step": 4856 + }, + { + "epoch": 17.534537246049663, + "grad_norm": 216.7202911376953, + "learning_rate": 3.6098003629764066e-06, + "loss": 21.6584, + "step": 4857 + }, + { + "epoch": 17.538148984198646, + "grad_norm": 260.20452880859375, + "learning_rate": 3.604355716878403e-06, + "loss": 22.9289, + "step": 4858 + }, + { + "epoch": 17.54176072234763, + "grad_norm": 295.9897766113281, + "learning_rate": 3.5989110707803995e-06, + "loss": 23.7172, + "step": 4859 + }, + { + "epoch": 17.545372460496615, + "grad_norm": 226.99484252929688, + "learning_rate": 3.5934664246823957e-06, + "loss": 37.5844, + "step": 4860 + }, + { + "epoch": 17.545372460496615, + "eval_loss": 0.6059216260910034, + "eval_runtime": 3.1302, + "eval_samples_per_second": 57.185, + "eval_steps_per_second": 57.185, + "step": 4860 + }, + { + "epoch": 17.5489841986456, + "grad_norm": 231.67477416992188, + "learning_rate": 3.588021778584392e-06, + "loss": 39.5191, + "step": 4861 + }, + { + "epoch": 17.55259593679458, + "grad_norm": 248.46058654785156, + "learning_rate": 3.5825771324863886e-06, + "loss": 39.4246, + "step": 4862 + }, + { + "epoch": 17.55620767494357, + "grad_norm": 239.17247009277344, + "learning_rate": 3.577132486388385e-06, + "loss": 38.9811, + "step": 4863 + }, + { + "epoch": 17.55981941309255, + "grad_norm": 325.3457946777344, + "learning_rate": 3.571687840290381e-06, + "loss": 38.4724, + "step": 4864 + }, + { + "epoch": 17.563431151241534, + "grad_norm": 264.5011901855469, + "learning_rate": 3.5662431941923773e-06, + "loss": 38.79, + "step": 4865 + }, + { + "epoch": 17.567042889390518, + "grad_norm": 251.97154235839844, + "learning_rate": 3.560798548094374e-06, + "loss": 38.0342, + "step": 4866 + }, + { + "epoch": 17.570654627539504, + "grad_norm": 236.78271484375, + "learning_rate": 3.55535390199637e-06, + "loss": 39.8586, + "step": 4867 + }, + { + "epoch": 17.574266365688487, + "grad_norm": 276.8800048828125, + "learning_rate": 3.549909255898367e-06, + "loss": 37.8967, + "step": 4868 + }, + { + "epoch": 17.57787810383747, + "grad_norm": 255.9346160888672, + "learning_rate": 3.544464609800363e-06, + "loss": 39.9833, + "step": 4869 + }, + { + "epoch": 17.581489841986457, + "grad_norm": 273.71337890625, + "learning_rate": 3.5390199637023597e-06, + "loss": 38.6235, + "step": 4870 + }, + { + "epoch": 17.581489841986457, + "eval_loss": 0.6033145189285278, + "eval_runtime": 3.1252, + "eval_samples_per_second": 57.275, + "eval_steps_per_second": 57.275, + "step": 4870 + }, + { + "epoch": 17.58510158013544, + "grad_norm": 252.93063354492188, + "learning_rate": 3.533575317604356e-06, + "loss": 37.9017, + "step": 4871 + }, + { + "epoch": 17.588713318284423, + "grad_norm": 259.8314208984375, + "learning_rate": 3.528130671506352e-06, + "loss": 34.6046, + "step": 4872 + }, + { + "epoch": 17.59232505643341, + "grad_norm": 230.2709197998047, + "learning_rate": 3.5226860254083484e-06, + "loss": 35.301, + "step": 4873 + }, + { + "epoch": 17.595936794582393, + "grad_norm": 306.6289367675781, + "learning_rate": 3.517241379310345e-06, + "loss": 37.4443, + "step": 4874 + }, + { + "epoch": 17.599548532731376, + "grad_norm": 241.5065460205078, + "learning_rate": 3.5117967332123413e-06, + "loss": 36.3646, + "step": 4875 + }, + { + "epoch": 17.603160270880363, + "grad_norm": 234.2492218017578, + "learning_rate": 3.5063520871143375e-06, + "loss": 36.2621, + "step": 4876 + }, + { + "epoch": 17.606772009029346, + "grad_norm": 256.5443115234375, + "learning_rate": 3.500907441016334e-06, + "loss": 36.2202, + "step": 4877 + }, + { + "epoch": 17.61038374717833, + "grad_norm": 280.31097412109375, + "learning_rate": 3.4954627949183304e-06, + "loss": 37.5031, + "step": 4878 + }, + { + "epoch": 17.613995485327315, + "grad_norm": 304.2773132324219, + "learning_rate": 3.4900181488203267e-06, + "loss": 37.1418, + "step": 4879 + }, + { + "epoch": 17.6176072234763, + "grad_norm": 361.27716064453125, + "learning_rate": 3.484573502722323e-06, + "loss": 37.1474, + "step": 4880 + }, + { + "epoch": 17.6176072234763, + "eval_loss": 0.6052342653274536, + "eval_runtime": 3.1249, + "eval_samples_per_second": 57.282, + "eval_steps_per_second": 57.282, + "step": 4880 + }, + { + "epoch": 17.62121896162528, + "grad_norm": 237.64540100097656, + "learning_rate": 3.4791288566243195e-06, + "loss": 38.0673, + "step": 4881 + }, + { + "epoch": 17.624830699774268, + "grad_norm": 351.27215576171875, + "learning_rate": 3.4736842105263158e-06, + "loss": 38.8272, + "step": 4882 + }, + { + "epoch": 17.62844243792325, + "grad_norm": 277.1895751953125, + "learning_rate": 3.4682395644283124e-06, + "loss": 39.1524, + "step": 4883 + }, + { + "epoch": 17.632054176072234, + "grad_norm": 275.1535949707031, + "learning_rate": 3.4627949183303086e-06, + "loss": 37.9027, + "step": 4884 + }, + { + "epoch": 17.635665914221217, + "grad_norm": 335.01776123046875, + "learning_rate": 3.4573502722323053e-06, + "loss": 36.7233, + "step": 4885 + }, + { + "epoch": 17.639277652370204, + "grad_norm": 297.1637878417969, + "learning_rate": 3.4519056261343015e-06, + "loss": 37.782, + "step": 4886 + }, + { + "epoch": 17.642889390519187, + "grad_norm": 265.400390625, + "learning_rate": 3.4464609800362978e-06, + "loss": 37.6639, + "step": 4887 + }, + { + "epoch": 17.64650112866817, + "grad_norm": 345.3449401855469, + "learning_rate": 3.441016333938294e-06, + "loss": 36.7617, + "step": 4888 + }, + { + "epoch": 17.650112866817157, + "grad_norm": 256.0724182128906, + "learning_rate": 3.4355716878402902e-06, + "loss": 32.9906, + "step": 4889 + }, + { + "epoch": 17.65372460496614, + "grad_norm": 260.698486328125, + "learning_rate": 3.430127041742287e-06, + "loss": 32.0811, + "step": 4890 + }, + { + "epoch": 17.65372460496614, + "eval_loss": 0.603126585483551, + "eval_runtime": 3.1268, + "eval_samples_per_second": 57.247, + "eval_steps_per_second": 57.247, + "step": 4890 + }, + { + "epoch": 17.657336343115123, + "grad_norm": 274.9847717285156, + "learning_rate": 3.424682395644283e-06, + "loss": 31.2138, + "step": 4891 + }, + { + "epoch": 17.66094808126411, + "grad_norm": 345.5099182128906, + "learning_rate": 3.4192377495462798e-06, + "loss": 30.302, + "step": 4892 + }, + { + "epoch": 17.664559819413093, + "grad_norm": 269.1453857421875, + "learning_rate": 3.413793103448276e-06, + "loss": 30.2679, + "step": 4893 + }, + { + "epoch": 17.668171557562076, + "grad_norm": 293.7955017089844, + "learning_rate": 3.4083484573502722e-06, + "loss": 31.7616, + "step": 4894 + }, + { + "epoch": 17.671783295711062, + "grad_norm": 306.1725769042969, + "learning_rate": 3.4029038112522685e-06, + "loss": 33.1265, + "step": 4895 + }, + { + "epoch": 17.675395033860045, + "grad_norm": 329.8185119628906, + "learning_rate": 3.397459165154265e-06, + "loss": 33.2131, + "step": 4896 + }, + { + "epoch": 17.67900677200903, + "grad_norm": 340.790283203125, + "learning_rate": 3.3920145190562613e-06, + "loss": 33.243, + "step": 4897 + }, + { + "epoch": 17.682618510158015, + "grad_norm": 324.004150390625, + "learning_rate": 3.386569872958258e-06, + "loss": 33.6235, + "step": 4898 + }, + { + "epoch": 17.686230248306998, + "grad_norm": 263.9126892089844, + "learning_rate": 3.3811252268602542e-06, + "loss": 33.2524, + "step": 4899 + }, + { + "epoch": 17.68984198645598, + "grad_norm": 274.6680603027344, + "learning_rate": 3.375680580762251e-06, + "loss": 34.6629, + "step": 4900 + }, + { + "epoch": 17.68984198645598, + "eval_loss": 0.6027778387069702, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4900 + }, + { + "epoch": 17.693453724604964, + "grad_norm": 317.1280822753906, + "learning_rate": 3.370235934664247e-06, + "loss": 33.3088, + "step": 4901 + }, + { + "epoch": 17.69706546275395, + "grad_norm": 304.1892395019531, + "learning_rate": 3.364791288566243e-06, + "loss": 34.5045, + "step": 4902 + }, + { + "epoch": 17.700677200902934, + "grad_norm": 278.75933837890625, + "learning_rate": 3.3593466424682396e-06, + "loss": 35.8429, + "step": 4903 + }, + { + "epoch": 17.704288939051917, + "grad_norm": 299.76971435546875, + "learning_rate": 3.353901996370236e-06, + "loss": 36.2401, + "step": 4904 + }, + { + "epoch": 17.707900677200904, + "grad_norm": 253.46795654296875, + "learning_rate": 3.3484573502722324e-06, + "loss": 28.938, + "step": 4905 + }, + { + "epoch": 17.711512415349887, + "grad_norm": 220.74098205566406, + "learning_rate": 3.3430127041742287e-06, + "loss": 21.6689, + "step": 4906 + }, + { + "epoch": 17.71512415349887, + "grad_norm": 255.79150390625, + "learning_rate": 3.3375680580762253e-06, + "loss": 21.3497, + "step": 4907 + }, + { + "epoch": 17.718735891647857, + "grad_norm": 284.2683410644531, + "learning_rate": 3.3321234119782216e-06, + "loss": 22.9276, + "step": 4908 + }, + { + "epoch": 17.72234762979684, + "grad_norm": 296.7882080078125, + "learning_rate": 3.3266787658802182e-06, + "loss": 24.7304, + "step": 4909 + }, + { + "epoch": 17.725959367945823, + "grad_norm": 217.35546875, + "learning_rate": 3.321234119782214e-06, + "loss": 38.7687, + "step": 4910 + }, + { + "epoch": 17.725959367945823, + "eval_loss": 0.6015192866325378, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4910 + }, + { + "epoch": 17.72957110609481, + "grad_norm": 256.7005920410156, + "learning_rate": 3.3157894736842107e-06, + "loss": 39.7421, + "step": 4911 + }, + { + "epoch": 17.733182844243792, + "grad_norm": 203.49417114257812, + "learning_rate": 3.310344827586207e-06, + "loss": 39.2911, + "step": 4912 + }, + { + "epoch": 17.736794582392776, + "grad_norm": 282.81439208984375, + "learning_rate": 3.3049001814882036e-06, + "loss": 39.2524, + "step": 4913 + }, + { + "epoch": 17.740406320541762, + "grad_norm": 315.3716735839844, + "learning_rate": 3.2994555353902e-06, + "loss": 37.2097, + "step": 4914 + }, + { + "epoch": 17.744018058690745, + "grad_norm": 250.96484375, + "learning_rate": 3.294010889292196e-06, + "loss": 37.6568, + "step": 4915 + }, + { + "epoch": 17.74762979683973, + "grad_norm": 299.4822082519531, + "learning_rate": 3.2885662431941927e-06, + "loss": 38.9578, + "step": 4916 + }, + { + "epoch": 17.751241534988715, + "grad_norm": 261.2537536621094, + "learning_rate": 3.2831215970961885e-06, + "loss": 40.3838, + "step": 4917 + }, + { + "epoch": 17.754853273137698, + "grad_norm": 220.55218505859375, + "learning_rate": 3.277676950998185e-06, + "loss": 39.2068, + "step": 4918 + }, + { + "epoch": 17.75846501128668, + "grad_norm": 238.06874084472656, + "learning_rate": 3.2722323049001814e-06, + "loss": 40.5383, + "step": 4919 + }, + { + "epoch": 17.762076749435664, + "grad_norm": 223.9597625732422, + "learning_rate": 3.266787658802178e-06, + "loss": 37.3857, + "step": 4920 + }, + { + "epoch": 17.762076749435664, + "eval_loss": 0.602606475353241, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4920 + }, + { + "epoch": 17.76568848758465, + "grad_norm": 278.9289245605469, + "learning_rate": 3.2613430127041742e-06, + "loss": 37.187, + "step": 4921 + }, + { + "epoch": 17.769300225733634, + "grad_norm": 306.52398681640625, + "learning_rate": 3.255898366606171e-06, + "loss": 37.5243, + "step": 4922 + }, + { + "epoch": 17.772911963882617, + "grad_norm": 231.3939208984375, + "learning_rate": 3.250453720508167e-06, + "loss": 35.3104, + "step": 4923 + }, + { + "epoch": 17.776523702031604, + "grad_norm": 216.77613830566406, + "learning_rate": 3.2450090744101638e-06, + "loss": 36.0904, + "step": 4924 + }, + { + "epoch": 17.780135440180587, + "grad_norm": 256.0504150390625, + "learning_rate": 3.2395644283121596e-06, + "loss": 36.4117, + "step": 4925 + }, + { + "epoch": 17.78374717832957, + "grad_norm": 253.29734802246094, + "learning_rate": 3.2341197822141562e-06, + "loss": 37.197, + "step": 4926 + }, + { + "epoch": 17.787358916478556, + "grad_norm": 268.80780029296875, + "learning_rate": 3.2286751361161525e-06, + "loss": 36.4606, + "step": 4927 + }, + { + "epoch": 17.79097065462754, + "grad_norm": 302.3041076660156, + "learning_rate": 3.2232304900181487e-06, + "loss": 36.8647, + "step": 4928 + }, + { + "epoch": 17.794582392776523, + "grad_norm": 274.23797607421875, + "learning_rate": 3.2177858439201454e-06, + "loss": 37.3981, + "step": 4929 + }, + { + "epoch": 17.79819413092551, + "grad_norm": 281.4304504394531, + "learning_rate": 3.2123411978221416e-06, + "loss": 37.2304, + "step": 4930 + }, + { + "epoch": 17.79819413092551, + "eval_loss": 0.6050394773483276, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 4930 + }, + { + "epoch": 17.801805869074492, + "grad_norm": 277.47698974609375, + "learning_rate": 3.2068965517241382e-06, + "loss": 35.9281, + "step": 4931 + }, + { + "epoch": 17.805417607223475, + "grad_norm": 394.02294921875, + "learning_rate": 3.201451905626134e-06, + "loss": 39.0143, + "step": 4932 + }, + { + "epoch": 17.809029345372462, + "grad_norm": 252.8087158203125, + "learning_rate": 3.1960072595281307e-06, + "loss": 36.9452, + "step": 4933 + }, + { + "epoch": 17.812641083521445, + "grad_norm": 249.54962158203125, + "learning_rate": 3.190562613430127e-06, + "loss": 39.2442, + "step": 4934 + }, + { + "epoch": 17.816252821670428, + "grad_norm": 286.9231262207031, + "learning_rate": 3.1851179673321236e-06, + "loss": 38.6445, + "step": 4935 + }, + { + "epoch": 17.819864559819415, + "grad_norm": 345.7146911621094, + "learning_rate": 3.17967332123412e-06, + "loss": 37.1794, + "step": 4936 + }, + { + "epoch": 17.823476297968398, + "grad_norm": 271.23089599609375, + "learning_rate": 3.1742286751361165e-06, + "loss": 36.3952, + "step": 4937 + }, + { + "epoch": 17.82708803611738, + "grad_norm": 406.3717346191406, + "learning_rate": 3.1687840290381127e-06, + "loss": 33.8166, + "step": 4938 + }, + { + "epoch": 17.830699774266364, + "grad_norm": 300.12554931640625, + "learning_rate": 3.1633393829401094e-06, + "loss": 30.9614, + "step": 4939 + }, + { + "epoch": 17.83431151241535, + "grad_norm": 229.67218017578125, + "learning_rate": 3.157894736842105e-06, + "loss": 31.8592, + "step": 4940 + }, + { + "epoch": 17.83431151241535, + "eval_loss": 0.6021057367324829, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4940 + }, + { + "epoch": 17.837923250564334, + "grad_norm": 269.0873107910156, + "learning_rate": 3.1524500907441014e-06, + "loss": 31.7702, + "step": 4941 + }, + { + "epoch": 17.841534988713317, + "grad_norm": 279.0237731933594, + "learning_rate": 3.147005444646098e-06, + "loss": 31.3615, + "step": 4942 + }, + { + "epoch": 17.845146726862303, + "grad_norm": 234.94839477539062, + "learning_rate": 3.1415607985480943e-06, + "loss": 31.9314, + "step": 4943 + }, + { + "epoch": 17.848758465011286, + "grad_norm": 239.25613403320312, + "learning_rate": 3.136116152450091e-06, + "loss": 32.4513, + "step": 4944 + }, + { + "epoch": 17.85237020316027, + "grad_norm": 257.09661865234375, + "learning_rate": 3.130671506352087e-06, + "loss": 34.4964, + "step": 4945 + }, + { + "epoch": 17.855981941309256, + "grad_norm": 328.88006591796875, + "learning_rate": 3.125226860254084e-06, + "loss": 33.1662, + "step": 4946 + }, + { + "epoch": 17.85959367945824, + "grad_norm": 291.4894714355469, + "learning_rate": 3.1197822141560796e-06, + "loss": 34.4406, + "step": 4947 + }, + { + "epoch": 17.863205417607222, + "grad_norm": 282.81158447265625, + "learning_rate": 3.1143375680580763e-06, + "loss": 32.7141, + "step": 4948 + }, + { + "epoch": 17.86681715575621, + "grad_norm": 300.0378112792969, + "learning_rate": 3.1088929219600725e-06, + "loss": 34.3423, + "step": 4949 + }, + { + "epoch": 17.870428893905192, + "grad_norm": 267.2983703613281, + "learning_rate": 3.103448275862069e-06, + "loss": 33.1653, + "step": 4950 + }, + { + "epoch": 17.870428893905192, + "eval_loss": 0.6020416021347046, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4950 + }, + { + "epoch": 17.874040632054175, + "grad_norm": 270.53277587890625, + "learning_rate": 3.0980036297640654e-06, + "loss": 34.7582, + "step": 4951 + }, + { + "epoch": 17.877652370203162, + "grad_norm": 346.0074157714844, + "learning_rate": 3.092558983666062e-06, + "loss": 35.9911, + "step": 4952 + }, + { + "epoch": 17.881264108352145, + "grad_norm": 367.5807189941406, + "learning_rate": 3.0871143375680583e-06, + "loss": 35.3345, + "step": 4953 + }, + { + "epoch": 17.884875846501128, + "grad_norm": 304.21649169921875, + "learning_rate": 3.0816696914700545e-06, + "loss": 32.9797, + "step": 4954 + }, + { + "epoch": 17.888487584650115, + "grad_norm": 253.14601135253906, + "learning_rate": 3.0762250453720507e-06, + "loss": 22.6226, + "step": 4955 + }, + { + "epoch": 17.892099322799098, + "grad_norm": 270.3512268066406, + "learning_rate": 3.070780399274047e-06, + "loss": 21.9531, + "step": 4956 + }, + { + "epoch": 17.89571106094808, + "grad_norm": 192.73712158203125, + "learning_rate": 3.0653357531760436e-06, + "loss": 21.8497, + "step": 4957 + }, + { + "epoch": 17.899322799097064, + "grad_norm": 254.43759155273438, + "learning_rate": 3.05989110707804e-06, + "loss": 23.2694, + "step": 4958 + }, + { + "epoch": 17.90293453724605, + "grad_norm": 271.2293395996094, + "learning_rate": 3.0544464609800365e-06, + "loss": 22.9774, + "step": 4959 + }, + { + "epoch": 17.906546275395034, + "grad_norm": 213.7334747314453, + "learning_rate": 3.0490018148820327e-06, + "loss": 38.8821, + "step": 4960 + }, + { + "epoch": 17.906546275395034, + "eval_loss": 0.600848913192749, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 4960 + }, + { + "epoch": 17.910158013544017, + "grad_norm": 269.9356384277344, + "learning_rate": 3.0435571687840294e-06, + "loss": 38.6362, + "step": 4961 + }, + { + "epoch": 17.913769751693003, + "grad_norm": 237.6484832763672, + "learning_rate": 3.0381125226860256e-06, + "loss": 39.6388, + "step": 4962 + }, + { + "epoch": 17.917381489841986, + "grad_norm": 304.2347106933594, + "learning_rate": 3.032667876588022e-06, + "loss": 39.4308, + "step": 4963 + }, + { + "epoch": 17.92099322799097, + "grad_norm": 250.6772918701172, + "learning_rate": 3.027223230490018e-06, + "loss": 40.1923, + "step": 4964 + }, + { + "epoch": 17.924604966139956, + "grad_norm": 261.7320556640625, + "learning_rate": 3.0217785843920147e-06, + "loss": 37.862, + "step": 4965 + }, + { + "epoch": 17.92821670428894, + "grad_norm": 385.33197021484375, + "learning_rate": 3.016333938294011e-06, + "loss": 35.9139, + "step": 4966 + }, + { + "epoch": 17.931828442437922, + "grad_norm": 436.6773986816406, + "learning_rate": 3.010889292196007e-06, + "loss": 36.6259, + "step": 4967 + }, + { + "epoch": 17.93544018058691, + "grad_norm": 318.65673828125, + "learning_rate": 3.005444646098004e-06, + "loss": 36.1235, + "step": 4968 + }, + { + "epoch": 17.939051918735892, + "grad_norm": 241.6234893798828, + "learning_rate": 3e-06, + "loss": 37.4148, + "step": 4969 + }, + { + "epoch": 17.942663656884875, + "grad_norm": 316.8415832519531, + "learning_rate": 2.9945553539019963e-06, + "loss": 36.7089, + "step": 4970 + }, + { + "epoch": 17.942663656884875, + "eval_loss": 0.6032605171203613, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.061, + "eval_steps_per_second": 57.061, + "step": 4970 + }, + { + "epoch": 17.94627539503386, + "grad_norm": 322.0501403808594, + "learning_rate": 2.9891107078039925e-06, + "loss": 37.2222, + "step": 4971 + }, + { + "epoch": 17.949887133182845, + "grad_norm": 300.4189453125, + "learning_rate": 2.983666061705989e-06, + "loss": 37.9156, + "step": 4972 + }, + { + "epoch": 17.953498871331828, + "grad_norm": 304.39263916015625, + "learning_rate": 2.9782214156079854e-06, + "loss": 38.5253, + "step": 4973 + }, + { + "epoch": 17.957110609480814, + "grad_norm": 297.4574890136719, + "learning_rate": 2.972776769509982e-06, + "loss": 38.4385, + "step": 4974 + }, + { + "epoch": 17.960722347629797, + "grad_norm": 367.7257080078125, + "learning_rate": 2.9673321234119783e-06, + "loss": 36.2943, + "step": 4975 + }, + { + "epoch": 17.96433408577878, + "grad_norm": 274.61724853515625, + "learning_rate": 2.961887477313975e-06, + "loss": 30.8753, + "step": 4976 + }, + { + "epoch": 17.967945823927764, + "grad_norm": 358.50201416015625, + "learning_rate": 2.956442831215971e-06, + "loss": 32.1308, + "step": 4977 + }, + { + "epoch": 17.97155756207675, + "grad_norm": 493.7792663574219, + "learning_rate": 2.9509981851179674e-06, + "loss": 33.2474, + "step": 4978 + }, + { + "epoch": 17.975169300225733, + "grad_norm": 426.67138671875, + "learning_rate": 2.9455535390199636e-06, + "loss": 33.7065, + "step": 4979 + }, + { + "epoch": 17.978781038374716, + "grad_norm": 524.0231323242188, + "learning_rate": 2.94010889292196e-06, + "loss": 34.6007, + "step": 4980 + }, + { + "epoch": 17.978781038374716, + "eval_loss": 0.6021283268928528, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 4980 + }, + { + "epoch": 17.982392776523703, + "grad_norm": 395.26715087890625, + "learning_rate": 2.9346642468239565e-06, + "loss": 33.9185, + "step": 4981 + }, + { + "epoch": 17.986004514672686, + "grad_norm": 400.0454406738281, + "learning_rate": 2.9292196007259528e-06, + "loss": 34.6485, + "step": 4982 + }, + { + "epoch": 17.98961625282167, + "grad_norm": 376.1269226074219, + "learning_rate": 2.9237749546279494e-06, + "loss": 34.668, + "step": 4983 + }, + { + "epoch": 17.993227990970656, + "grad_norm": 315.5225524902344, + "learning_rate": 2.9183303085299456e-06, + "loss": 30.7058, + "step": 4984 + }, + { + "epoch": 17.99683972911964, + "grad_norm": 221.5032958984375, + "learning_rate": 2.912885662431942e-06, + "loss": 21.8055, + "step": 4985 + }, + { + "epoch": 18.0, + "grad_norm": 226.06068420410156, + "learning_rate": 2.907441016333938e-06, + "loss": 20.5066, + "step": 4986 + }, + { + "epoch": 18.003611738148983, + "grad_norm": 209.69607543945312, + "learning_rate": 2.9019963702359348e-06, + "loss": 37.9156, + "step": 4987 + }, + { + "epoch": 18.00722347629797, + "grad_norm": 218.86709594726562, + "learning_rate": 2.896551724137931e-06, + "loss": 38.8204, + "step": 4988 + }, + { + "epoch": 18.010835214446953, + "grad_norm": 218.38180541992188, + "learning_rate": 2.8911070780399276e-06, + "loss": 38.5472, + "step": 4989 + }, + { + "epoch": 18.014446952595936, + "grad_norm": 338.4778747558594, + "learning_rate": 2.885662431941924e-06, + "loss": 37.7233, + "step": 4990 + }, + { + "epoch": 18.014446952595936, + "eval_loss": 0.6013379096984863, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4990 + }, + { + "epoch": 18.018058690744923, + "grad_norm": 309.5385437011719, + "learning_rate": 2.8802177858439205e-06, + "loss": 38.3321, + "step": 4991 + }, + { + "epoch": 18.021670428893906, + "grad_norm": 335.67169189453125, + "learning_rate": 2.8747731397459168e-06, + "loss": 38.2367, + "step": 4992 + }, + { + "epoch": 18.02528216704289, + "grad_norm": 260.5025939941406, + "learning_rate": 2.8693284936479126e-06, + "loss": 38.5516, + "step": 4993 + }, + { + "epoch": 18.028893905191875, + "grad_norm": 265.4793395996094, + "learning_rate": 2.8638838475499092e-06, + "loss": 38.9539, + "step": 4994 + }, + { + "epoch": 18.03250564334086, + "grad_norm": 237.87942504882812, + "learning_rate": 2.8584392014519054e-06, + "loss": 39.4582, + "step": 4995 + }, + { + "epoch": 18.03611738148984, + "grad_norm": 252.11746215820312, + "learning_rate": 2.852994555353902e-06, + "loss": 39.3466, + "step": 4996 + }, + { + "epoch": 18.039729119638825, + "grad_norm": 298.1370849609375, + "learning_rate": 2.8475499092558983e-06, + "loss": 36.9779, + "step": 4997 + }, + { + "epoch": 18.04334085778781, + "grad_norm": 341.9007873535156, + "learning_rate": 2.842105263157895e-06, + "loss": 36.5117, + "step": 4998 + }, + { + "epoch": 18.046952595936794, + "grad_norm": 210.0319366455078, + "learning_rate": 2.8366606170598912e-06, + "loss": 34.7543, + "step": 4999 + }, + { + "epoch": 18.050564334085777, + "grad_norm": 385.6400146484375, + "learning_rate": 2.831215970961888e-06, + "loss": 36.4577, + "step": 5000 + }, + { + "epoch": 18.050564334085777, + "eval_loss": 0.6031082272529602, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 5000 + }, + { + "epoch": 18.054176072234764, + "grad_norm": 268.4949035644531, + "learning_rate": 2.8257713248638837e-06, + "loss": 36.3765, + "step": 5001 + }, + { + "epoch": 18.057787810383747, + "grad_norm": 311.2984313964844, + "learning_rate": 2.8203266787658803e-06, + "loss": 35.709, + "step": 5002 + }, + { + "epoch": 18.06139954853273, + "grad_norm": 264.0671081542969, + "learning_rate": 2.8148820326678766e-06, + "loss": 35.7978, + "step": 5003 + }, + { + "epoch": 18.065011286681717, + "grad_norm": 341.0770263671875, + "learning_rate": 2.8094373865698732e-06, + "loss": 36.8963, + "step": 5004 + }, + { + "epoch": 18.0686230248307, + "grad_norm": 253.3942108154297, + "learning_rate": 2.8039927404718694e-06, + "loss": 37.1135, + "step": 5005 + }, + { + "epoch": 18.072234762979683, + "grad_norm": 286.23736572265625, + "learning_rate": 2.7985480943738657e-06, + "loss": 35.736, + "step": 5006 + }, + { + "epoch": 18.07584650112867, + "grad_norm": 327.71295166015625, + "learning_rate": 2.7931034482758623e-06, + "loss": 36.4917, + "step": 5007 + }, + { + "epoch": 18.079458239277653, + "grad_norm": 351.00616455078125, + "learning_rate": 2.787658802177858e-06, + "loss": 37.2807, + "step": 5008 + }, + { + "epoch": 18.083069977426636, + "grad_norm": 291.02923583984375, + "learning_rate": 2.782214156079855e-06, + "loss": 38.0345, + "step": 5009 + }, + { + "epoch": 18.086681715575622, + "grad_norm": 288.7776184082031, + "learning_rate": 2.776769509981851e-06, + "loss": 37.112, + "step": 5010 + }, + { + "epoch": 18.086681715575622, + "eval_loss": 0.6058472990989685, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 5010 + }, + { + "epoch": 18.090293453724605, + "grad_norm": 437.8114929199219, + "learning_rate": 2.7713248638838477e-06, + "loss": 37.9063, + "step": 5011 + }, + { + "epoch": 18.09390519187359, + "grad_norm": 324.5924072265625, + "learning_rate": 2.765880217785844e-06, + "loss": 37.8524, + "step": 5012 + }, + { + "epoch": 18.097516930022575, + "grad_norm": 358.40625, + "learning_rate": 2.7604355716878406e-06, + "loss": 37.5547, + "step": 5013 + }, + { + "epoch": 18.101128668171558, + "grad_norm": 290.75604248046875, + "learning_rate": 2.7549909255898368e-06, + "loss": 36.4437, + "step": 5014 + }, + { + "epoch": 18.10474040632054, + "grad_norm": 284.41424560546875, + "learning_rate": 2.7495462794918334e-06, + "loss": 34.3336, + "step": 5015 + }, + { + "epoch": 18.108352144469524, + "grad_norm": 254.59889221191406, + "learning_rate": 2.7441016333938292e-06, + "loss": 32.4527, + "step": 5016 + }, + { + "epoch": 18.11196388261851, + "grad_norm": 266.0207214355469, + "learning_rate": 2.738656987295826e-06, + "loss": 30.4014, + "step": 5017 + }, + { + "epoch": 18.115575620767494, + "grad_norm": 219.9434356689453, + "learning_rate": 2.733212341197822e-06, + "loss": 30.2838, + "step": 5018 + }, + { + "epoch": 18.119187358916477, + "grad_norm": 312.7678527832031, + "learning_rate": 2.7277676950998188e-06, + "loss": 31.6877, + "step": 5019 + }, + { + "epoch": 18.122799097065464, + "grad_norm": 282.99774169921875, + "learning_rate": 2.722323049001815e-06, + "loss": 33.3686, + "step": 5020 + }, + { + "epoch": 18.122799097065464, + "eval_loss": 0.6027761697769165, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 5020 + }, + { + "epoch": 18.126410835214447, + "grad_norm": 371.9994201660156, + "learning_rate": 2.7168784029038112e-06, + "loss": 32.5397, + "step": 5021 + }, + { + "epoch": 18.13002257336343, + "grad_norm": 241.19049072265625, + "learning_rate": 2.711433756805808e-06, + "loss": 33.4329, + "step": 5022 + }, + { + "epoch": 18.133634311512417, + "grad_norm": 310.2216796875, + "learning_rate": 2.7059891107078037e-06, + "loss": 31.888, + "step": 5023 + }, + { + "epoch": 18.1372460496614, + "grad_norm": 277.1349182128906, + "learning_rate": 2.7005444646098004e-06, + "loss": 33.9345, + "step": 5024 + }, + { + "epoch": 18.140857787810383, + "grad_norm": 419.3515930175781, + "learning_rate": 2.6950998185117966e-06, + "loss": 33.5826, + "step": 5025 + }, + { + "epoch": 18.14446952595937, + "grad_norm": 289.1166687011719, + "learning_rate": 2.6896551724137932e-06, + "loss": 34.324, + "step": 5026 + }, + { + "epoch": 18.148081264108352, + "grad_norm": 364.20233154296875, + "learning_rate": 2.6842105263157895e-06, + "loss": 34.45, + "step": 5027 + }, + { + "epoch": 18.151693002257336, + "grad_norm": 341.71551513671875, + "learning_rate": 2.678765880217786e-06, + "loss": 33.9126, + "step": 5028 + }, + { + "epoch": 18.155304740406322, + "grad_norm": 283.1939697265625, + "learning_rate": 2.6733212341197824e-06, + "loss": 33.7188, + "step": 5029 + }, + { + "epoch": 18.158916478555305, + "grad_norm": 369.6583251953125, + "learning_rate": 2.667876588021779e-06, + "loss": 35.0354, + "step": 5030 + }, + { + "epoch": 18.158916478555305, + "eval_loss": 0.6033984422683716, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 5030 + }, + { + "epoch": 18.16252821670429, + "grad_norm": 323.95806884765625, + "learning_rate": 2.662431941923775e-06, + "loss": 34.6853, + "step": 5031 + }, + { + "epoch": 18.16613995485327, + "grad_norm": 274.2629089355469, + "learning_rate": 2.6569872958257715e-06, + "loss": 32.1261, + "step": 5032 + }, + { + "epoch": 18.169751693002258, + "grad_norm": 229.66163635253906, + "learning_rate": 2.6515426497277677e-06, + "loss": 22.0549, + "step": 5033 + }, + { + "epoch": 18.17336343115124, + "grad_norm": 212.78070068359375, + "learning_rate": 2.646098003629764e-06, + "loss": 21.4483, + "step": 5034 + }, + { + "epoch": 18.176975169300224, + "grad_norm": 184.7995147705078, + "learning_rate": 2.6406533575317606e-06, + "loss": 22.5133, + "step": 5035 + }, + { + "epoch": 18.18058690744921, + "grad_norm": 256.6748046875, + "learning_rate": 2.635208711433757e-06, + "loss": 23.6443, + "step": 5036 + }, + { + "epoch": 18.184198645598194, + "grad_norm": 230.683349609375, + "learning_rate": 2.6297640653357535e-06, + "loss": 38.3633, + "step": 5037 + }, + { + "epoch": 18.187810383747177, + "grad_norm": 251.70166015625, + "learning_rate": 2.6243194192377497e-06, + "loss": 40.1229, + "step": 5038 + }, + { + "epoch": 18.191422121896164, + "grad_norm": 219.9066162109375, + "learning_rate": 2.618874773139746e-06, + "loss": 38.6539, + "step": 5039 + }, + { + "epoch": 18.195033860045147, + "grad_norm": 290.7185974121094, + "learning_rate": 2.613430127041742e-06, + "loss": 38.0385, + "step": 5040 + }, + { + "epoch": 18.195033860045147, + "eval_loss": 0.6022469401359558, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.993, + "eval_steps_per_second": 56.993, + "step": 5040 + }, + { + "epoch": 18.19864559819413, + "grad_norm": 334.9693908691406, + "learning_rate": 2.607985480943739e-06, + "loss": 38.2381, + "step": 5041 + }, + { + "epoch": 18.202257336343116, + "grad_norm": 283.9659423828125, + "learning_rate": 2.602540834845735e-06, + "loss": 39.2603, + "step": 5042 + }, + { + "epoch": 18.2058690744921, + "grad_norm": 291.4002990722656, + "learning_rate": 2.5970961887477317e-06, + "loss": 39.633, + "step": 5043 + }, + { + "epoch": 18.209480812641083, + "grad_norm": 249.14329528808594, + "learning_rate": 2.591651542649728e-06, + "loss": 39.1938, + "step": 5044 + }, + { + "epoch": 18.21309255079007, + "grad_norm": 226.1659393310547, + "learning_rate": 2.5862068965517246e-06, + "loss": 39.8308, + "step": 5045 + }, + { + "epoch": 18.216704288939052, + "grad_norm": 270.2198181152344, + "learning_rate": 2.5807622504537204e-06, + "loss": 38.4712, + "step": 5046 + }, + { + "epoch": 18.220316027088035, + "grad_norm": 263.83819580078125, + "learning_rate": 2.5753176043557166e-06, + "loss": 37.3572, + "step": 5047 + }, + { + "epoch": 18.223927765237022, + "grad_norm": 316.8177795410156, + "learning_rate": 2.5698729582577133e-06, + "loss": 36.3821, + "step": 5048 + }, + { + "epoch": 18.227539503386005, + "grad_norm": 318.7213134765625, + "learning_rate": 2.5644283121597095e-06, + "loss": 34.8209, + "step": 5049 + }, + { + "epoch": 18.231151241534988, + "grad_norm": 267.6168518066406, + "learning_rate": 2.558983666061706e-06, + "loss": 35.6173, + "step": 5050 + }, + { + "epoch": 18.231151241534988, + "eval_loss": 0.6044466495513916, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 5050 + }, + { + "epoch": 18.23476297968397, + "grad_norm": 277.739501953125, + "learning_rate": 2.5535390199637024e-06, + "loss": 35.2828, + "step": 5051 + }, + { + "epoch": 18.238374717832958, + "grad_norm": 288.2068786621094, + "learning_rate": 2.548094373865699e-06, + "loss": 36.7972, + "step": 5052 + }, + { + "epoch": 18.24198645598194, + "grad_norm": 217.59716796875, + "learning_rate": 2.5426497277676953e-06, + "loss": 36.3637, + "step": 5053 + }, + { + "epoch": 18.245598194130924, + "grad_norm": 411.8970031738281, + "learning_rate": 2.5372050816696915e-06, + "loss": 37.3086, + "step": 5054 + }, + { + "epoch": 18.24920993227991, + "grad_norm": 351.9718933105469, + "learning_rate": 2.5317604355716877e-06, + "loss": 37.0896, + "step": 5055 + }, + { + "epoch": 18.252821670428894, + "grad_norm": 343.1683044433594, + "learning_rate": 2.5263157894736844e-06, + "loss": 37.2533, + "step": 5056 + }, + { + "epoch": 18.256433408577877, + "grad_norm": 413.0977783203125, + "learning_rate": 2.5208711433756806e-06, + "loss": 36.9987, + "step": 5057 + }, + { + "epoch": 18.260045146726863, + "grad_norm": 331.73223876953125, + "learning_rate": 2.5154264972776773e-06, + "loss": 36.8624, + "step": 5058 + }, + { + "epoch": 18.263656884875846, + "grad_norm": 434.96990966796875, + "learning_rate": 2.5099818511796735e-06, + "loss": 37.949, + "step": 5059 + }, + { + "epoch": 18.26726862302483, + "grad_norm": 324.4934997558594, + "learning_rate": 2.5045372050816697e-06, + "loss": 37.6272, + "step": 5060 + }, + { + "epoch": 18.26726862302483, + "eval_loss": 0.6042292714118958, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 5060 + }, + { + "epoch": 18.270880361173816, + "grad_norm": 312.1228942871094, + "learning_rate": 2.499092558983666e-06, + "loss": 38.6362, + "step": 5061 + }, + { + "epoch": 18.2744920993228, + "grad_norm": 427.6184997558594, + "learning_rate": 2.493647912885662e-06, + "loss": 39.2934, + "step": 5062 + }, + { + "epoch": 18.278103837471782, + "grad_norm": 344.6819763183594, + "learning_rate": 2.488203266787659e-06, + "loss": 38.0684, + "step": 5063 + }, + { + "epoch": 18.28171557562077, + "grad_norm": 317.42303466796875, + "learning_rate": 2.482758620689655e-06, + "loss": 38.2323, + "step": 5064 + }, + { + "epoch": 18.285327313769752, + "grad_norm": 338.830810546875, + "learning_rate": 2.4773139745916517e-06, + "loss": 34.2699, + "step": 5065 + }, + { + "epoch": 18.288939051918735, + "grad_norm": 286.7263488769531, + "learning_rate": 2.471869328493648e-06, + "loss": 32.5149, + "step": 5066 + }, + { + "epoch": 18.292550790067722, + "grad_norm": 278.9923095703125, + "learning_rate": 2.4664246823956446e-06, + "loss": 31.033, + "step": 5067 + }, + { + "epoch": 18.296162528216705, + "grad_norm": 264.0198669433594, + "learning_rate": 2.460980036297641e-06, + "loss": 29.5549, + "step": 5068 + }, + { + "epoch": 18.299774266365688, + "grad_norm": 241.6163330078125, + "learning_rate": 2.455535390199637e-06, + "loss": 30.2173, + "step": 5069 + }, + { + "epoch": 18.30338600451467, + "grad_norm": 278.5418395996094, + "learning_rate": 2.4500907441016333e-06, + "loss": 30.8286, + "step": 5070 + }, + { + "epoch": 18.30338600451467, + "eval_loss": 0.6035094261169434, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 5070 + }, + { + "epoch": 18.306997742663658, + "grad_norm": 277.5758056640625, + "learning_rate": 2.44464609800363e-06, + "loss": 33.6778, + "step": 5071 + }, + { + "epoch": 18.31060948081264, + "grad_norm": 295.81201171875, + "learning_rate": 2.439201451905626e-06, + "loss": 33.5914, + "step": 5072 + }, + { + "epoch": 18.314221218961624, + "grad_norm": 293.4093017578125, + "learning_rate": 2.4337568058076224e-06, + "loss": 33.6203, + "step": 5073 + }, + { + "epoch": 18.31783295711061, + "grad_norm": 277.2228698730469, + "learning_rate": 2.428312159709619e-06, + "loss": 33.6465, + "step": 5074 + }, + { + "epoch": 18.321444695259594, + "grad_norm": 286.3224792480469, + "learning_rate": 2.4228675136116153e-06, + "loss": 32.6013, + "step": 5075 + }, + { + "epoch": 18.325056433408577, + "grad_norm": 320.6168212890625, + "learning_rate": 2.417422867513612e-06, + "loss": 32.6469, + "step": 5076 + }, + { + "epoch": 18.328668171557563, + "grad_norm": 327.364990234375, + "learning_rate": 2.4119782214156078e-06, + "loss": 34.354, + "step": 5077 + }, + { + "epoch": 18.332279909706546, + "grad_norm": 342.06634521484375, + "learning_rate": 2.4065335753176044e-06, + "loss": 34.3143, + "step": 5078 + }, + { + "epoch": 18.33589164785553, + "grad_norm": 370.70343017578125, + "learning_rate": 2.4010889292196006e-06, + "loss": 33.7771, + "step": 5079 + }, + { + "epoch": 18.339503386004516, + "grad_norm": 358.7357177734375, + "learning_rate": 2.3956442831215973e-06, + "loss": 35.5377, + "step": 5080 + }, + { + "epoch": 18.339503386004516, + "eval_loss": 0.6033809185028076, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.931, + "eval_steps_per_second": 56.931, + "step": 5080 + }, + { + "epoch": 18.3431151241535, + "grad_norm": 463.8668518066406, + "learning_rate": 2.3901996370235935e-06, + "loss": 35.4711, + "step": 5081 + }, + { + "epoch": 18.346726862302482, + "grad_norm": 256.5113220214844, + "learning_rate": 2.38475499092559e-06, + "loss": 26.8532, + "step": 5082 + }, + { + "epoch": 18.35033860045147, + "grad_norm": 228.83883666992188, + "learning_rate": 2.3793103448275864e-06, + "loss": 21.6636, + "step": 5083 + }, + { + "epoch": 18.353950338600452, + "grad_norm": 238.70742797851562, + "learning_rate": 2.3738656987295826e-06, + "loss": 22.2091, + "step": 5084 + }, + { + "epoch": 18.357562076749435, + "grad_norm": 276.8741760253906, + "learning_rate": 2.368421052631579e-06, + "loss": 22.1242, + "step": 5085 + }, + { + "epoch": 18.36117381489842, + "grad_norm": 226.4810333251953, + "learning_rate": 2.362976406533575e-06, + "loss": 23.359, + "step": 5086 + }, + { + "epoch": 18.364785553047405, + "grad_norm": 212.53111267089844, + "learning_rate": 2.3575317604355718e-06, + "loss": 37.7694, + "step": 5087 + }, + { + "epoch": 18.368397291196388, + "grad_norm": 227.26710510253906, + "learning_rate": 2.352087114337568e-06, + "loss": 39.8064, + "step": 5088 + }, + { + "epoch": 18.37200902934537, + "grad_norm": 201.0309295654297, + "learning_rate": 2.3466424682395646e-06, + "loss": 38.9716, + "step": 5089 + }, + { + "epoch": 18.375620767494357, + "grad_norm": 311.7691345214844, + "learning_rate": 2.341197822141561e-06, + "loss": 39.8326, + "step": 5090 + }, + { + "epoch": 18.375620767494357, + "eval_loss": 0.6036086082458496, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 5090 + }, + { + "epoch": 18.37923250564334, + "grad_norm": 251.5362091064453, + "learning_rate": 2.3357531760435575e-06, + "loss": 38.2591, + "step": 5091 + }, + { + "epoch": 18.382844243792324, + "grad_norm": 241.64373779296875, + "learning_rate": 2.3303085299455533e-06, + "loss": 38.0327, + "step": 5092 + }, + { + "epoch": 18.38645598194131, + "grad_norm": 231.7598114013672, + "learning_rate": 2.32486388384755e-06, + "loss": 38.6853, + "step": 5093 + }, + { + "epoch": 18.390067720090293, + "grad_norm": 287.66644287109375, + "learning_rate": 2.3194192377495462e-06, + "loss": 39.6929, + "step": 5094 + }, + { + "epoch": 18.393679458239276, + "grad_norm": 289.3146057128906, + "learning_rate": 2.313974591651543e-06, + "loss": 38.3129, + "step": 5095 + }, + { + "epoch": 18.397291196388263, + "grad_norm": 291.4801330566406, + "learning_rate": 2.308529945553539e-06, + "loss": 38.2505, + "step": 5096 + }, + { + "epoch": 18.400902934537246, + "grad_norm": 337.4052429199219, + "learning_rate": 2.3030852994555358e-06, + "loss": 37.7476, + "step": 5097 + }, + { + "epoch": 18.40451467268623, + "grad_norm": 460.0773010253906, + "learning_rate": 2.297640653357532e-06, + "loss": 36.1112, + "step": 5098 + }, + { + "epoch": 18.408126410835216, + "grad_norm": 322.4940185546875, + "learning_rate": 2.292196007259528e-06, + "loss": 36.5374, + "step": 5099 + }, + { + "epoch": 18.4117381489842, + "grad_norm": 350.4710388183594, + "learning_rate": 2.2867513611615244e-06, + "loss": 37.5286, + "step": 5100 + }, + { + "epoch": 18.4117381489842, + "eval_loss": 0.6045494079589844, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.101, + "eval_steps_per_second": 57.101, + "step": 5100 + }, + { + "epoch": 18.415349887133182, + "grad_norm": 306.18634033203125, + "learning_rate": 2.2813067150635207e-06, + "loss": 37.3676, + "step": 5101 + }, + { + "epoch": 18.41896162528217, + "grad_norm": 289.237060546875, + "learning_rate": 2.2758620689655173e-06, + "loss": 36.6916, + "step": 5102 + }, + { + "epoch": 18.42257336343115, + "grad_norm": 266.69207763671875, + "learning_rate": 2.2704174228675136e-06, + "loss": 36.2887, + "step": 5103 + }, + { + "epoch": 18.426185101580135, + "grad_norm": 264.54119873046875, + "learning_rate": 2.2649727767695102e-06, + "loss": 37.1267, + "step": 5104 + }, + { + "epoch": 18.42979683972912, + "grad_norm": 262.6132507324219, + "learning_rate": 2.2595281306715064e-06, + "loss": 36.6862, + "step": 5105 + }, + { + "epoch": 18.433408577878104, + "grad_norm": 231.68226623535156, + "learning_rate": 2.254083484573503e-06, + "loss": 35.7714, + "step": 5106 + }, + { + "epoch": 18.437020316027088, + "grad_norm": 299.72613525390625, + "learning_rate": 2.248638838475499e-06, + "loss": 37.648, + "step": 5107 + }, + { + "epoch": 18.44063205417607, + "grad_norm": 424.94708251953125, + "learning_rate": 2.2431941923774956e-06, + "loss": 35.9776, + "step": 5108 + }, + { + "epoch": 18.444243792325057, + "grad_norm": 449.78570556640625, + "learning_rate": 2.2377495462794918e-06, + "loss": 38.0571, + "step": 5109 + }, + { + "epoch": 18.44785553047404, + "grad_norm": 284.00634765625, + "learning_rate": 2.2323049001814884e-06, + "loss": 37.758, + "step": 5110 + }, + { + "epoch": 18.44785553047404, + "eval_loss": 0.6064541935920715, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.048, + "eval_steps_per_second": 57.048, + "step": 5110 + }, + { + "epoch": 18.451467268623023, + "grad_norm": 359.1011962890625, + "learning_rate": 2.2268602540834847e-06, + "loss": 38.8924, + "step": 5111 + }, + { + "epoch": 18.45507900677201, + "grad_norm": 307.7583923339844, + "learning_rate": 2.221415607985481e-06, + "loss": 38.2116, + "step": 5112 + }, + { + "epoch": 18.458690744920993, + "grad_norm": 359.5586242675781, + "learning_rate": 2.2159709618874776e-06, + "loss": 39.6894, + "step": 5113 + }, + { + "epoch": 18.462302483069976, + "grad_norm": 258.3985595703125, + "learning_rate": 2.2105263157894734e-06, + "loss": 36.4586, + "step": 5114 + }, + { + "epoch": 18.465914221218963, + "grad_norm": 363.09600830078125, + "learning_rate": 2.20508166969147e-06, + "loss": 34.489, + "step": 5115 + }, + { + "epoch": 18.469525959367946, + "grad_norm": 237.136474609375, + "learning_rate": 2.1996370235934662e-06, + "loss": 32.5826, + "step": 5116 + }, + { + "epoch": 18.47313769751693, + "grad_norm": 400.25604248046875, + "learning_rate": 2.194192377495463e-06, + "loss": 31.3005, + "step": 5117 + }, + { + "epoch": 18.476749435665916, + "grad_norm": 467.9855651855469, + "learning_rate": 2.188747731397459e-06, + "loss": 30.2261, + "step": 5118 + }, + { + "epoch": 18.4803611738149, + "grad_norm": 384.4250183105469, + "learning_rate": 2.1833030852994558e-06, + "loss": 33.5844, + "step": 5119 + }, + { + "epoch": 18.483972911963882, + "grad_norm": 324.4369201660156, + "learning_rate": 2.177858439201452e-06, + "loss": 32.5136, + "step": 5120 + }, + { + "epoch": 18.483972911963882, + "eval_loss": 0.602573573589325, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 5120 + }, + { + "epoch": 18.48758465011287, + "grad_norm": 372.0033264160156, + "learning_rate": 2.1724137931034487e-06, + "loss": 31.4322, + "step": 5121 + }, + { + "epoch": 18.49119638826185, + "grad_norm": 336.265869140625, + "learning_rate": 2.1669691470054445e-06, + "loss": 34.163, + "step": 5122 + }, + { + "epoch": 18.494808126410835, + "grad_norm": 339.8494873046875, + "learning_rate": 2.161524500907441e-06, + "loss": 31.2627, + "step": 5123 + }, + { + "epoch": 18.498419864559818, + "grad_norm": 279.3925476074219, + "learning_rate": 2.1560798548094374e-06, + "loss": 32.3994, + "step": 5124 + }, + { + "epoch": 18.502031602708804, + "grad_norm": 281.546875, + "learning_rate": 2.1506352087114336e-06, + "loss": 34.8467, + "step": 5125 + }, + { + "epoch": 18.505643340857787, + "grad_norm": 315.8692626953125, + "learning_rate": 2.1451905626134302e-06, + "loss": 33.632, + "step": 5126 + }, + { + "epoch": 18.50925507900677, + "grad_norm": 289.3066711425781, + "learning_rate": 2.1397459165154265e-06, + "loss": 34.312, + "step": 5127 + }, + { + "epoch": 18.512866817155757, + "grad_norm": 274.190673828125, + "learning_rate": 2.134301270417423e-06, + "loss": 32.9937, + "step": 5128 + }, + { + "epoch": 18.51647855530474, + "grad_norm": 317.9950256347656, + "learning_rate": 2.1288566243194194e-06, + "loss": 35.8788, + "step": 5129 + }, + { + "epoch": 18.520090293453723, + "grad_norm": 342.9775695800781, + "learning_rate": 2.1234119782214156e-06, + "loss": 35.2397, + "step": 5130 + }, + { + "epoch": 18.520090293453723, + "eval_loss": 0.6024553179740906, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 5130 + }, + { + "epoch": 18.52370203160271, + "grad_norm": 351.09637451171875, + "learning_rate": 2.117967332123412e-06, + "loss": 33.1556, + "step": 5131 + }, + { + "epoch": 18.527313769751693, + "grad_norm": 229.55613708496094, + "learning_rate": 2.1125226860254085e-06, + "loss": 26.6317, + "step": 5132 + }, + { + "epoch": 18.530925507900676, + "grad_norm": 234.53562927246094, + "learning_rate": 2.1070780399274047e-06, + "loss": 21.316, + "step": 5133 + }, + { + "epoch": 18.534537246049663, + "grad_norm": 241.59982299804688, + "learning_rate": 2.1016333938294014e-06, + "loss": 21.2739, + "step": 5134 + }, + { + "epoch": 18.538148984198646, + "grad_norm": 207.2808380126953, + "learning_rate": 2.0961887477313976e-06, + "loss": 22.736, + "step": 5135 + }, + { + "epoch": 18.54176072234763, + "grad_norm": 236.13955688476562, + "learning_rate": 2.0907441016333942e-06, + "loss": 22.7503, + "step": 5136 + }, + { + "epoch": 18.545372460496615, + "grad_norm": 181.6793670654297, + "learning_rate": 2.08529945553539e-06, + "loss": 37.9001, + "step": 5137 + }, + { + "epoch": 18.5489841986456, + "grad_norm": 249.5441131591797, + "learning_rate": 2.0798548094373863e-06, + "loss": 39.52, + "step": 5138 + }, + { + "epoch": 18.55259593679458, + "grad_norm": 215.67855834960938, + "learning_rate": 2.074410163339383e-06, + "loss": 38.6667, + "step": 5139 + }, + { + "epoch": 18.55620767494357, + "grad_norm": 280.9402770996094, + "learning_rate": 2.068965517241379e-06, + "loss": 36.9602, + "step": 5140 + }, + { + "epoch": 18.55620767494357, + "eval_loss": 0.6027256846427917, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 5140 + }, + { + "epoch": 18.55981941309255, + "grad_norm": 265.9155578613281, + "learning_rate": 2.063520871143376e-06, + "loss": 38.8654, + "step": 5141 + }, + { + "epoch": 18.563431151241534, + "grad_norm": 300.0267028808594, + "learning_rate": 2.058076225045372e-06, + "loss": 38.8917, + "step": 5142 + }, + { + "epoch": 18.567042889390518, + "grad_norm": 243.0481414794922, + "learning_rate": 2.0526315789473687e-06, + "loss": 39.2785, + "step": 5143 + }, + { + "epoch": 18.570654627539504, + "grad_norm": 270.58380126953125, + "learning_rate": 2.047186932849365e-06, + "loss": 39.3892, + "step": 5144 + }, + { + "epoch": 18.574266365688487, + "grad_norm": 311.60430908203125, + "learning_rate": 2.041742286751361e-06, + "loss": 39.5933, + "step": 5145 + }, + { + "epoch": 18.57787810383747, + "grad_norm": 285.160400390625, + "learning_rate": 2.0362976406533574e-06, + "loss": 38.2962, + "step": 5146 + }, + { + "epoch": 18.581489841986457, + "grad_norm": 232.0592041015625, + "learning_rate": 2.030852994555354e-06, + "loss": 38.5965, + "step": 5147 + }, + { + "epoch": 18.58510158013544, + "grad_norm": 221.85525512695312, + "learning_rate": 2.0254083484573503e-06, + "loss": 36.516, + "step": 5148 + }, + { + "epoch": 18.588713318284423, + "grad_norm": 291.9794921875, + "learning_rate": 2.019963702359347e-06, + "loss": 36.3976, + "step": 5149 + }, + { + "epoch": 18.59232505643341, + "grad_norm": 387.8580322265625, + "learning_rate": 2.014519056261343e-06, + "loss": 35.2321, + "step": 5150 + }, + { + "epoch": 18.59232505643341, + "eval_loss": 0.6030355095863342, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.046, + "eval_steps_per_second": 57.046, + "step": 5150 + }, + { + "epoch": 18.595936794582393, + "grad_norm": 300.14508056640625, + "learning_rate": 2.0090744101633394e-06, + "loss": 36.4186, + "step": 5151 + }, + { + "epoch": 18.599548532731376, + "grad_norm": 294.1235656738281, + "learning_rate": 2.0036297640653356e-06, + "loss": 36.014, + "step": 5152 + }, + { + "epoch": 18.603160270880363, + "grad_norm": 389.1570129394531, + "learning_rate": 1.998185117967332e-06, + "loss": 36.1648, + "step": 5153 + }, + { + "epoch": 18.606772009029346, + "grad_norm": 244.6651153564453, + "learning_rate": 1.9927404718693285e-06, + "loss": 36.1033, + "step": 5154 + }, + { + "epoch": 18.61038374717833, + "grad_norm": 302.52996826171875, + "learning_rate": 1.9872958257713247e-06, + "loss": 37.1531, + "step": 5155 + }, + { + "epoch": 18.613995485327315, + "grad_norm": 352.86273193359375, + "learning_rate": 1.9818511796733214e-06, + "loss": 37.8204, + "step": 5156 + }, + { + "epoch": 18.6176072234763, + "grad_norm": 308.61431884765625, + "learning_rate": 1.9764065335753176e-06, + "loss": 37.2097, + "step": 5157 + }, + { + "epoch": 18.62121896162528, + "grad_norm": 288.30712890625, + "learning_rate": 1.9709618874773143e-06, + "loss": 36.4242, + "step": 5158 + }, + { + "epoch": 18.624830699774268, + "grad_norm": 315.9750671386719, + "learning_rate": 1.9655172413793105e-06, + "loss": 35.9204, + "step": 5159 + }, + { + "epoch": 18.62844243792325, + "grad_norm": 468.51055908203125, + "learning_rate": 1.9600725952813067e-06, + "loss": 38.9178, + "step": 5160 + }, + { + "epoch": 18.62844243792325, + "eval_loss": 0.6054540872573853, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 5160 + }, + { + "epoch": 18.632054176072234, + "grad_norm": 310.5861511230469, + "learning_rate": 1.954627949183303e-06, + "loss": 37.9588, + "step": 5161 + }, + { + "epoch": 18.635665914221217, + "grad_norm": 424.3090515136719, + "learning_rate": 1.9491833030852996e-06, + "loss": 38.1028, + "step": 5162 + }, + { + "epoch": 18.639277652370204, + "grad_norm": 330.6189880371094, + "learning_rate": 1.943738656987296e-06, + "loss": 36.5096, + "step": 5163 + }, + { + "epoch": 18.642889390519187, + "grad_norm": 305.9330139160156, + "learning_rate": 1.9382940108892925e-06, + "loss": 36.871, + "step": 5164 + }, + { + "epoch": 18.64650112866817, + "grad_norm": 410.06793212890625, + "learning_rate": 1.9328493647912887e-06, + "loss": 37.4061, + "step": 5165 + }, + { + "epoch": 18.650112866817157, + "grad_norm": 385.49127197265625, + "learning_rate": 1.927404718693285e-06, + "loss": 33.6399, + "step": 5166 + }, + { + "epoch": 18.65372460496614, + "grad_norm": 270.96783447265625, + "learning_rate": 1.9219600725952816e-06, + "loss": 31.3483, + "step": 5167 + }, + { + "epoch": 18.657336343115123, + "grad_norm": 329.84405517578125, + "learning_rate": 1.9165154264972774e-06, + "loss": 30.2639, + "step": 5168 + }, + { + "epoch": 18.66094808126411, + "grad_norm": 413.7260437011719, + "learning_rate": 1.911070780399274e-06, + "loss": 31.2749, + "step": 5169 + }, + { + "epoch": 18.664559819413093, + "grad_norm": 276.43585205078125, + "learning_rate": 1.9056261343012705e-06, + "loss": 30.3596, + "step": 5170 + }, + { + "epoch": 18.664559819413093, + "eval_loss": 0.6022100448608398, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.117, + "eval_steps_per_second": 57.117, + "step": 5170 + }, + { + "epoch": 18.668171557562076, + "grad_norm": 248.9257049560547, + "learning_rate": 1.9001814882032667e-06, + "loss": 32.4066, + "step": 5171 + }, + { + "epoch": 18.671783295711062, + "grad_norm": 252.70388793945312, + "learning_rate": 1.8947368421052632e-06, + "loss": 32.3724, + "step": 5172 + }, + { + "epoch": 18.675395033860045, + "grad_norm": 325.0677795410156, + "learning_rate": 1.8892921960072596e-06, + "loss": 32.3041, + "step": 5173 + }, + { + "epoch": 18.67900677200903, + "grad_norm": 420.9740295410156, + "learning_rate": 1.883847549909256e-06, + "loss": 32.6609, + "step": 5174 + }, + { + "epoch": 18.682618510158015, + "grad_norm": 239.59371948242188, + "learning_rate": 1.878402903811252e-06, + "loss": 32.8471, + "step": 5175 + }, + { + "epoch": 18.686230248306998, + "grad_norm": 301.13165283203125, + "learning_rate": 1.8729582577132487e-06, + "loss": 32.2686, + "step": 5176 + }, + { + "epoch": 18.68984198645598, + "grad_norm": 282.7923889160156, + "learning_rate": 1.867513611615245e-06, + "loss": 34.2726, + "step": 5177 + }, + { + "epoch": 18.693453724604964, + "grad_norm": 434.20550537109375, + "learning_rate": 1.8620689655172414e-06, + "loss": 35.335, + "step": 5178 + }, + { + "epoch": 18.69706546275395, + "grad_norm": 306.680908203125, + "learning_rate": 1.8566243194192379e-06, + "loss": 33.3156, + "step": 5179 + }, + { + "epoch": 18.700677200902934, + "grad_norm": 253.27711486816406, + "learning_rate": 1.8511796733212343e-06, + "loss": 34.9504, + "step": 5180 + }, + { + "epoch": 18.700677200902934, + "eval_loss": 0.6021104454994202, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 5180 + }, + { + "epoch": 18.704288939051917, + "grad_norm": 391.74945068359375, + "learning_rate": 1.8457350272232305e-06, + "loss": 35.285, + "step": 5181 + }, + { + "epoch": 18.707900677200904, + "grad_norm": 265.4142150878906, + "learning_rate": 1.840290381125227e-06, + "loss": 27.689, + "step": 5182 + }, + { + "epoch": 18.711512415349887, + "grad_norm": 217.80746459960938, + "learning_rate": 1.8348457350272234e-06, + "loss": 22.6159, + "step": 5183 + }, + { + "epoch": 18.71512415349887, + "grad_norm": 220.21180725097656, + "learning_rate": 1.8294010889292196e-06, + "loss": 22.1321, + "step": 5184 + }, + { + "epoch": 18.718735891647857, + "grad_norm": 239.4197998046875, + "learning_rate": 1.8239564428312159e-06, + "loss": 22.5479, + "step": 5185 + }, + { + "epoch": 18.72234762979684, + "grad_norm": 281.7828674316406, + "learning_rate": 1.8185117967332123e-06, + "loss": 23.5363, + "step": 5186 + }, + { + "epoch": 18.725959367945823, + "grad_norm": 231.81980895996094, + "learning_rate": 1.8130671506352088e-06, + "loss": 39.0953, + "step": 5187 + }, + { + "epoch": 18.72957110609481, + "grad_norm": 242.0535430908203, + "learning_rate": 1.807622504537205e-06, + "loss": 39.4842, + "step": 5188 + }, + { + "epoch": 18.733182844243792, + "grad_norm": 235.6869659423828, + "learning_rate": 1.8021778584392014e-06, + "loss": 37.4884, + "step": 5189 + }, + { + "epoch": 18.736794582392776, + "grad_norm": 291.5176086425781, + "learning_rate": 1.7967332123411979e-06, + "loss": 38.9612, + "step": 5190 + }, + { + "epoch": 18.736794582392776, + "eval_loss": 0.6040608286857605, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 5190 + }, + { + "epoch": 18.740406320541762, + "grad_norm": 407.5574645996094, + "learning_rate": 1.7912885662431943e-06, + "loss": 39.3531, + "step": 5191 + }, + { + "epoch": 18.744018058690745, + "grad_norm": 277.07891845703125, + "learning_rate": 1.7858439201451905e-06, + "loss": 38.4866, + "step": 5192 + }, + { + "epoch": 18.74762979683973, + "grad_norm": 350.2939453125, + "learning_rate": 1.780399274047187e-06, + "loss": 38.0073, + "step": 5193 + }, + { + "epoch": 18.751241534988715, + "grad_norm": 395.7618103027344, + "learning_rate": 1.7749546279491834e-06, + "loss": 38.1693, + "step": 5194 + }, + { + "epoch": 18.754853273137698, + "grad_norm": 296.43267822265625, + "learning_rate": 1.7695099818511799e-06, + "loss": 38.6162, + "step": 5195 + }, + { + "epoch": 18.75846501128668, + "grad_norm": 335.7173156738281, + "learning_rate": 1.764065335753176e-06, + "loss": 38.9182, + "step": 5196 + }, + { + "epoch": 18.762076749435664, + "grad_norm": 273.09368896484375, + "learning_rate": 1.7586206896551725e-06, + "loss": 38.0685, + "step": 5197 + }, + { + "epoch": 18.76568848758465, + "grad_norm": 359.718505859375, + "learning_rate": 1.7531760435571688e-06, + "loss": 36.8994, + "step": 5198 + }, + { + "epoch": 18.769300225733634, + "grad_norm": 345.5837097167969, + "learning_rate": 1.7477313974591652e-06, + "loss": 35.375, + "step": 5199 + }, + { + "epoch": 18.772911963882617, + "grad_norm": 266.8583984375, + "learning_rate": 1.7422867513611614e-06, + "loss": 34.7559, + "step": 5200 + }, + { + "epoch": 18.772911963882617, + "eval_loss": 0.6007165908813477, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 5200 + }, + { + "epoch": 18.776523702031604, + "grad_norm": 317.10662841796875, + "learning_rate": 1.7368421052631579e-06, + "loss": 35.6206, + "step": 5201 + }, + { + "epoch": 18.780135440180587, + "grad_norm": 418.6651916503906, + "learning_rate": 1.7313974591651543e-06, + "loss": 36.7981, + "step": 5202 + }, + { + "epoch": 18.78374717832957, + "grad_norm": 247.767333984375, + "learning_rate": 1.7259528130671508e-06, + "loss": 36.226, + "step": 5203 + }, + { + "epoch": 18.787358916478556, + "grad_norm": 406.6683349609375, + "learning_rate": 1.720508166969147e-06, + "loss": 36.5781, + "step": 5204 + }, + { + "epoch": 18.79097065462754, + "grad_norm": 433.02984619140625, + "learning_rate": 1.7150635208711434e-06, + "loss": 37.8221, + "step": 5205 + }, + { + "epoch": 18.794582392776523, + "grad_norm": 291.1831970214844, + "learning_rate": 1.7096188747731399e-06, + "loss": 37.9125, + "step": 5206 + }, + { + "epoch": 18.79819413092551, + "grad_norm": 276.8603820800781, + "learning_rate": 1.7041742286751361e-06, + "loss": 38.0886, + "step": 5207 + }, + { + "epoch": 18.801805869074492, + "grad_norm": 442.06317138671875, + "learning_rate": 1.6987295825771326e-06, + "loss": 36.8432, + "step": 5208 + }, + { + "epoch": 18.805417607223475, + "grad_norm": 323.7881774902344, + "learning_rate": 1.693284936479129e-06, + "loss": 37.2775, + "step": 5209 + }, + { + "epoch": 18.809029345372462, + "grad_norm": 320.2378234863281, + "learning_rate": 1.6878402903811254e-06, + "loss": 37.4478, + "step": 5210 + }, + { + "epoch": 18.809029345372462, + "eval_loss": 0.6044604182243347, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 5210 + }, + { + "epoch": 18.812641083521445, + "grad_norm": 474.6519470214844, + "learning_rate": 1.6823956442831215e-06, + "loss": 37.9463, + "step": 5211 + }, + { + "epoch": 18.816252821670428, + "grad_norm": 265.7474060058594, + "learning_rate": 1.676950998185118e-06, + "loss": 37.7662, + "step": 5212 + }, + { + "epoch": 18.819864559819415, + "grad_norm": 312.014892578125, + "learning_rate": 1.6715063520871143e-06, + "loss": 37.3329, + "step": 5213 + }, + { + "epoch": 18.823476297968398, + "grad_norm": 407.24884033203125, + "learning_rate": 1.6660617059891108e-06, + "loss": 36.4324, + "step": 5214 + }, + { + "epoch": 18.82708803611738, + "grad_norm": 368.05255126953125, + "learning_rate": 1.660617059891107e-06, + "loss": 33.9691, + "step": 5215 + }, + { + "epoch": 18.830699774266364, + "grad_norm": 410.3034362792969, + "learning_rate": 1.6551724137931035e-06, + "loss": 32.7008, + "step": 5216 + }, + { + "epoch": 18.83431151241535, + "grad_norm": 318.6436462402344, + "learning_rate": 1.6497277676951e-06, + "loss": 32.1152, + "step": 5217 + }, + { + "epoch": 18.837923250564334, + "grad_norm": 366.3927307128906, + "learning_rate": 1.6442831215970963e-06, + "loss": 31.3827, + "step": 5218 + }, + { + "epoch": 18.841534988713317, + "grad_norm": 319.7497863769531, + "learning_rate": 1.6388384754990926e-06, + "loss": 30.781, + "step": 5219 + }, + { + "epoch": 18.845146726862303, + "grad_norm": 405.86669921875, + "learning_rate": 1.633393829401089e-06, + "loss": 30.5807, + "step": 5220 + }, + { + "epoch": 18.845146726862303, + "eval_loss": 0.6014994382858276, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 5220 + }, + { + "epoch": 18.848758465011286, + "grad_norm": 518.0769653320312, + "learning_rate": 1.6279491833030855e-06, + "loss": 33.4028, + "step": 5221 + }, + { + "epoch": 18.85237020316027, + "grad_norm": 390.18609619140625, + "learning_rate": 1.6225045372050819e-06, + "loss": 31.805, + "step": 5222 + }, + { + "epoch": 18.855981941309256, + "grad_norm": 323.1091003417969, + "learning_rate": 1.6170598911070781e-06, + "loss": 33.4414, + "step": 5223 + }, + { + "epoch": 18.85959367945824, + "grad_norm": 311.3610534667969, + "learning_rate": 1.6116152450090744e-06, + "loss": 34.1178, + "step": 5224 + }, + { + "epoch": 18.863205417607222, + "grad_norm": 271.058349609375, + "learning_rate": 1.6061705989110708e-06, + "loss": 34.4702, + "step": 5225 + }, + { + "epoch": 18.86681715575621, + "grad_norm": 301.3417663574219, + "learning_rate": 1.600725952813067e-06, + "loss": 32.5166, + "step": 5226 + }, + { + "epoch": 18.870428893905192, + "grad_norm": 259.4634094238281, + "learning_rate": 1.5952813067150635e-06, + "loss": 32.1952, + "step": 5227 + }, + { + "epoch": 18.874040632054175, + "grad_norm": 299.018310546875, + "learning_rate": 1.58983666061706e-06, + "loss": 33.6772, + "step": 5228 + }, + { + "epoch": 18.877652370203162, + "grad_norm": 286.192626953125, + "learning_rate": 1.5843920145190564e-06, + "loss": 35.4991, + "step": 5229 + }, + { + "epoch": 18.881264108352145, + "grad_norm": 380.0414733886719, + "learning_rate": 1.5789473684210526e-06, + "loss": 34.4324, + "step": 5230 + }, + { + "epoch": 18.881264108352145, + "eval_loss": 0.6009039282798767, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 5230 + }, + { + "epoch": 18.884875846501128, + "grad_norm": 333.0609436035156, + "learning_rate": 1.573502722323049e-06, + "loss": 35.8757, + "step": 5231 + }, + { + "epoch": 18.888487584650115, + "grad_norm": 343.6198425292969, + "learning_rate": 1.5680580762250455e-06, + "loss": 30.4765, + "step": 5232 + }, + { + "epoch": 18.892099322799098, + "grad_norm": 222.56637573242188, + "learning_rate": 1.562613430127042e-06, + "loss": 21.2017, + "step": 5233 + }, + { + "epoch": 18.89571106094808, + "grad_norm": 209.6859130859375, + "learning_rate": 1.5571687840290381e-06, + "loss": 21.5447, + "step": 5234 + }, + { + "epoch": 18.899322799097064, + "grad_norm": 249.7464141845703, + "learning_rate": 1.5517241379310346e-06, + "loss": 23.6495, + "step": 5235 + }, + { + "epoch": 18.90293453724605, + "grad_norm": 267.1141357421875, + "learning_rate": 1.546279491833031e-06, + "loss": 23.0331, + "step": 5236 + }, + { + "epoch": 18.906546275395034, + "grad_norm": 204.96266174316406, + "learning_rate": 1.5408348457350273e-06, + "loss": 37.8988, + "step": 5237 + }, + { + "epoch": 18.910158013544017, + "grad_norm": 247.50706481933594, + "learning_rate": 1.5353901996370235e-06, + "loss": 38.5207, + "step": 5238 + }, + { + "epoch": 18.913769751693003, + "grad_norm": 350.968994140625, + "learning_rate": 1.52994555353902e-06, + "loss": 37.981, + "step": 5239 + }, + { + "epoch": 18.917381489841986, + "grad_norm": 308.0031433105469, + "learning_rate": 1.5245009074410164e-06, + "loss": 39.2602, + "step": 5240 + }, + { + "epoch": 18.917381489841986, + "eval_loss": 0.6020543575286865, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 5240 + }, + { + "epoch": 18.92099322799097, + "grad_norm": 353.0065002441406, + "learning_rate": 1.5190562613430128e-06, + "loss": 39.7331, + "step": 5241 + }, + { + "epoch": 18.924604966139956, + "grad_norm": 495.2381591796875, + "learning_rate": 1.513611615245009e-06, + "loss": 37.6413, + "step": 5242 + }, + { + "epoch": 18.92821670428894, + "grad_norm": 470.453125, + "learning_rate": 1.5081669691470055e-06, + "loss": 36.1928, + "step": 5243 + }, + { + "epoch": 18.931828442437922, + "grad_norm": 632.1090698242188, + "learning_rate": 1.502722323049002e-06, + "loss": 37.4057, + "step": 5244 + }, + { + "epoch": 18.93544018058691, + "grad_norm": 488.4659118652344, + "learning_rate": 1.4972776769509982e-06, + "loss": 37.1323, + "step": 5245 + }, + { + "epoch": 18.939051918735892, + "grad_norm": 426.4764709472656, + "learning_rate": 1.4918330308529946e-06, + "loss": 36.1739, + "step": 5246 + }, + { + "epoch": 18.942663656884875, + "grad_norm": 413.3072509765625, + "learning_rate": 1.486388384754991e-06, + "loss": 36.243, + "step": 5247 + }, + { + "epoch": 18.94627539503386, + "grad_norm": 364.8636169433594, + "learning_rate": 1.4809437386569875e-06, + "loss": 36.8362, + "step": 5248 + }, + { + "epoch": 18.949887133182845, + "grad_norm": 306.2213134765625, + "learning_rate": 1.4754990925589837e-06, + "loss": 38.4677, + "step": 5249 + }, + { + "epoch": 18.953498871331828, + "grad_norm": 300.37664794921875, + "learning_rate": 1.47005444646098e-06, + "loss": 38.1286, + "step": 5250 + }, + { + "epoch": 18.953498871331828, + "eval_loss": 0.6017122864723206, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 5250 + }, + { + "epoch": 18.957110609480814, + "grad_norm": 242.2681884765625, + "learning_rate": 1.4646098003629764e-06, + "loss": 36.9862, + "step": 5251 + }, + { + "epoch": 18.960722347629797, + "grad_norm": 276.28179931640625, + "learning_rate": 1.4591651542649728e-06, + "loss": 35.0475, + "step": 5252 + }, + { + "epoch": 18.96433408577878, + "grad_norm": 256.64508056640625, + "learning_rate": 1.453720508166969e-06, + "loss": 30.4778, + "step": 5253 + }, + { + "epoch": 18.967945823927764, + "grad_norm": 275.1043701171875, + "learning_rate": 1.4482758620689655e-06, + "loss": 32.3847, + "step": 5254 + }, + { + "epoch": 18.97155756207675, + "grad_norm": 324.22955322265625, + "learning_rate": 1.442831215970962e-06, + "loss": 32.9917, + "step": 5255 + }, + { + "epoch": 18.975169300225733, + "grad_norm": 328.7778625488281, + "learning_rate": 1.4373865698729584e-06, + "loss": 31.5901, + "step": 5256 + }, + { + "epoch": 18.978781038374716, + "grad_norm": 307.2234191894531, + "learning_rate": 1.4319419237749546e-06, + "loss": 33.5733, + "step": 5257 + }, + { + "epoch": 18.982392776523703, + "grad_norm": 471.10552978515625, + "learning_rate": 1.426497277676951e-06, + "loss": 33.3204, + "step": 5258 + }, + { + "epoch": 18.986004514672686, + "grad_norm": 286.2314453125, + "learning_rate": 1.4210526315789475e-06, + "loss": 35.8205, + "step": 5259 + }, + { + "epoch": 18.98961625282167, + "grad_norm": 341.5156555175781, + "learning_rate": 1.415607985480944e-06, + "loss": 35.7746, + "step": 5260 + }, + { + "epoch": 18.98961625282167, + "eval_loss": 0.6023879051208496, + "eval_runtime": 3.1375, + "eval_samples_per_second": 57.051, + "eval_steps_per_second": 57.051, + "step": 5260 + }, + { + "epoch": 18.993227990970656, + "grad_norm": 257.73345947265625, + "learning_rate": 1.4101633393829402e-06, + "loss": 26.5263, + "step": 5261 + }, + { + "epoch": 18.99683972911964, + "grad_norm": 197.04811096191406, + "learning_rate": 1.4047186932849366e-06, + "loss": 21.9504, + "step": 5262 + }, + { + "epoch": 19.0, + "grad_norm": 237.48069763183594, + "learning_rate": 1.3992740471869328e-06, + "loss": 20.273, + "step": 5263 + }, + { + "epoch": 19.003611738148983, + "grad_norm": 238.98065185546875, + "learning_rate": 1.393829401088929e-06, + "loss": 37.7406, + "step": 5264 + }, + { + "epoch": 19.00722347629797, + "grad_norm": 209.30593872070312, + "learning_rate": 1.3883847549909255e-06, + "loss": 39.8367, + "step": 5265 + }, + { + "epoch": 19.010835214446953, + "grad_norm": 251.27899169921875, + "learning_rate": 1.382940108892922e-06, + "loss": 39.0155, + "step": 5266 + }, + { + "epoch": 19.014446952595936, + "grad_norm": 278.8317565917969, + "learning_rate": 1.3774954627949184e-06, + "loss": 37.9895, + "step": 5267 + }, + { + "epoch": 19.018058690744923, + "grad_norm": 227.08090209960938, + "learning_rate": 1.3720508166969146e-06, + "loss": 38.2986, + "step": 5268 + }, + { + "epoch": 19.021670428893906, + "grad_norm": 248.63221740722656, + "learning_rate": 1.366606170598911e-06, + "loss": 38.9906, + "step": 5269 + }, + { + "epoch": 19.02528216704289, + "grad_norm": 216.49449157714844, + "learning_rate": 1.3611615245009075e-06, + "loss": 39.4871, + "step": 5270 + }, + { + "epoch": 19.02528216704289, + "eval_loss": 0.6001354455947876, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 5270 + }, + { + "epoch": 19.028893905191875, + "grad_norm": 219.4734649658203, + "learning_rate": 1.355716878402904e-06, + "loss": 38.8617, + "step": 5271 + }, + { + "epoch": 19.03250564334086, + "grad_norm": 211.6996307373047, + "learning_rate": 1.3502722323049002e-06, + "loss": 39.6489, + "step": 5272 + }, + { + "epoch": 19.03611738148984, + "grad_norm": 306.1536865234375, + "learning_rate": 1.3448275862068966e-06, + "loss": 39.4235, + "step": 5273 + }, + { + "epoch": 19.039729119638825, + "grad_norm": 260.87353515625, + "learning_rate": 1.339382940108893e-06, + "loss": 37.9957, + "step": 5274 + }, + { + "epoch": 19.04334085778781, + "grad_norm": 266.5260314941406, + "learning_rate": 1.3339382940108895e-06, + "loss": 36.4288, + "step": 5275 + }, + { + "epoch": 19.046952595936794, + "grad_norm": 295.3840637207031, + "learning_rate": 1.3284936479128857e-06, + "loss": 35.1091, + "step": 5276 + }, + { + "epoch": 19.050564334085777, + "grad_norm": 381.60748291015625, + "learning_rate": 1.323049001814882e-06, + "loss": 37.6468, + "step": 5277 + }, + { + "epoch": 19.054176072234764, + "grad_norm": 430.3531494140625, + "learning_rate": 1.3176043557168784e-06, + "loss": 35.8345, + "step": 5278 + }, + { + "epoch": 19.057787810383747, + "grad_norm": 393.22772216796875, + "learning_rate": 1.3121597096188749e-06, + "loss": 37.1803, + "step": 5279 + }, + { + "epoch": 19.06139954853273, + "grad_norm": 308.1875915527344, + "learning_rate": 1.306715063520871e-06, + "loss": 36.5634, + "step": 5280 + }, + { + "epoch": 19.06139954853273, + "eval_loss": 0.6008215546607971, + "eval_runtime": 3.1371, + "eval_samples_per_second": 57.059, + "eval_steps_per_second": 57.059, + "step": 5280 + }, + { + "epoch": 19.065011286681717, + "grad_norm": 379.57183837890625, + "learning_rate": 1.3012704174228675e-06, + "loss": 36.7718, + "step": 5281 + }, + { + "epoch": 19.0686230248307, + "grad_norm": 482.2864685058594, + "learning_rate": 1.295825771324864e-06, + "loss": 37.0207, + "step": 5282 + }, + { + "epoch": 19.072234762979683, + "grad_norm": 310.96142578125, + "learning_rate": 1.2903811252268602e-06, + "loss": 37.0438, + "step": 5283 + }, + { + "epoch": 19.07584650112867, + "grad_norm": 274.2409973144531, + "learning_rate": 1.2849364791288566e-06, + "loss": 36.3401, + "step": 5284 + }, + { + "epoch": 19.079458239277653, + "grad_norm": 242.37583923339844, + "learning_rate": 1.279491833030853e-06, + "loss": 36.6312, + "step": 5285 + }, + { + "epoch": 19.083069977426636, + "grad_norm": 244.91583251953125, + "learning_rate": 1.2740471869328495e-06, + "loss": 37.4987, + "step": 5286 + }, + { + "epoch": 19.086681715575622, + "grad_norm": 234.21511840820312, + "learning_rate": 1.2686025408348458e-06, + "loss": 38.1373, + "step": 5287 + }, + { + "epoch": 19.090293453724605, + "grad_norm": 277.73931884765625, + "learning_rate": 1.2631578947368422e-06, + "loss": 38.8423, + "step": 5288 + }, + { + "epoch": 19.09390519187359, + "grad_norm": 247.04971313476562, + "learning_rate": 1.2577132486388386e-06, + "loss": 37.2783, + "step": 5289 + }, + { + "epoch": 19.097516930022575, + "grad_norm": 289.022216796875, + "learning_rate": 1.2522686025408349e-06, + "loss": 36.2534, + "step": 5290 + }, + { + "epoch": 19.097516930022575, + "eval_loss": 0.6020083427429199, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 5290 + }, + { + "epoch": 19.101128668171558, + "grad_norm": 294.7291564941406, + "learning_rate": 1.246823956442831e-06, + "loss": 36.4967, + "step": 5291 + }, + { + "epoch": 19.10474040632054, + "grad_norm": 238.0512237548828, + "learning_rate": 1.2413793103448275e-06, + "loss": 34.1439, + "step": 5292 + }, + { + "epoch": 19.108352144469524, + "grad_norm": 254.0712127685547, + "learning_rate": 1.235934664246824e-06, + "loss": 30.9632, + "step": 5293 + }, + { + "epoch": 19.11196388261851, + "grad_norm": 321.169921875, + "learning_rate": 1.2304900181488204e-06, + "loss": 29.2757, + "step": 5294 + }, + { + "epoch": 19.115575620767494, + "grad_norm": 308.8040466308594, + "learning_rate": 1.2250453720508167e-06, + "loss": 31.2651, + "step": 5295 + }, + { + "epoch": 19.119187358916477, + "grad_norm": 369.23004150390625, + "learning_rate": 1.219600725952813e-06, + "loss": 32.9721, + "step": 5296 + }, + { + "epoch": 19.122799097065464, + "grad_norm": 348.9309997558594, + "learning_rate": 1.2141560798548095e-06, + "loss": 31.8663, + "step": 5297 + }, + { + "epoch": 19.126410835214447, + "grad_norm": 330.5960388183594, + "learning_rate": 1.208711433756806e-06, + "loss": 31.6104, + "step": 5298 + }, + { + "epoch": 19.13002257336343, + "grad_norm": 380.59161376953125, + "learning_rate": 1.2032667876588022e-06, + "loss": 32.1911, + "step": 5299 + }, + { + "epoch": 19.133634311512417, + "grad_norm": 402.8847961425781, + "learning_rate": 1.1978221415607986e-06, + "loss": 33.4755, + "step": 5300 + }, + { + "epoch": 19.133634311512417, + "eval_loss": 0.6015223264694214, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 5300 + }, + { + "epoch": 19.1372460496614, + "grad_norm": 409.90667724609375, + "learning_rate": 1.192377495462795e-06, + "loss": 33.7318, + "step": 5301 + }, + { + "epoch": 19.140857787810383, + "grad_norm": 425.7220458984375, + "learning_rate": 1.1869328493647913e-06, + "loss": 33.6745, + "step": 5302 + }, + { + "epoch": 19.14446952595937, + "grad_norm": 373.9212951660156, + "learning_rate": 1.1814882032667876e-06, + "loss": 33.8191, + "step": 5303 + }, + { + "epoch": 19.148081264108352, + "grad_norm": 381.37469482421875, + "learning_rate": 1.176043557168784e-06, + "loss": 33.8767, + "step": 5304 + }, + { + "epoch": 19.151693002257336, + "grad_norm": 267.89288330078125, + "learning_rate": 1.1705989110707804e-06, + "loss": 33.3089, + "step": 5305 + }, + { + "epoch": 19.155304740406322, + "grad_norm": 326.5400390625, + "learning_rate": 1.1651542649727767e-06, + "loss": 35.798, + "step": 5306 + }, + { + "epoch": 19.158916478555305, + "grad_norm": 307.7875061035156, + "learning_rate": 1.1597096188747731e-06, + "loss": 34.2442, + "step": 5307 + }, + { + "epoch": 19.16252821670429, + "grad_norm": 401.6629333496094, + "learning_rate": 1.1542649727767695e-06, + "loss": 34.7408, + "step": 5308 + }, + { + "epoch": 19.16613995485327, + "grad_norm": 297.7433166503906, + "learning_rate": 1.148820326678766e-06, + "loss": 30.2776, + "step": 5309 + }, + { + "epoch": 19.169751693002258, + "grad_norm": 221.2977752685547, + "learning_rate": 1.1433756805807622e-06, + "loss": 21.3755, + "step": 5310 + }, + { + "epoch": 19.169751693002258, + "eval_loss": 0.6015586853027344, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 5310 + }, + { + "epoch": 19.17336343115124, + "grad_norm": 232.3973846435547, + "learning_rate": 1.1379310344827587e-06, + "loss": 20.9516, + "step": 5311 + }, + { + "epoch": 19.176975169300224, + "grad_norm": 220.6842803955078, + "learning_rate": 1.1324863883847551e-06, + "loss": 22.3779, + "step": 5312 + }, + { + "epoch": 19.18058690744921, + "grad_norm": 207.9031982421875, + "learning_rate": 1.1270417422867515e-06, + "loss": 23.4166, + "step": 5313 + }, + { + "epoch": 19.184198645598194, + "grad_norm": 211.70394897460938, + "learning_rate": 1.1215970961887478e-06, + "loss": 37.157, + "step": 5314 + }, + { + "epoch": 19.187810383747177, + "grad_norm": 243.7276611328125, + "learning_rate": 1.1161524500907442e-06, + "loss": 40.0688, + "step": 5315 + }, + { + "epoch": 19.191422121896164, + "grad_norm": 199.99435424804688, + "learning_rate": 1.1107078039927405e-06, + "loss": 38.9213, + "step": 5316 + }, + { + "epoch": 19.195033860045147, + "grad_norm": 214.8607177734375, + "learning_rate": 1.1052631578947367e-06, + "loss": 37.5778, + "step": 5317 + }, + { + "epoch": 19.19864559819413, + "grad_norm": 241.69651794433594, + "learning_rate": 1.0998185117967331e-06, + "loss": 36.9334, + "step": 5318 + }, + { + "epoch": 19.202257336343116, + "grad_norm": 344.64849853515625, + "learning_rate": 1.0943738656987296e-06, + "loss": 38.9315, + "step": 5319 + }, + { + "epoch": 19.2058690744921, + "grad_norm": 248.10731506347656, + "learning_rate": 1.088929219600726e-06, + "loss": 37.94, + "step": 5320 + }, + { + "epoch": 19.2058690744921, + "eval_loss": 0.6011462211608887, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 5320 + }, + { + "epoch": 19.209480812641083, + "grad_norm": 262.3296813964844, + "learning_rate": 1.0834845735027222e-06, + "loss": 38.2016, + "step": 5321 + }, + { + "epoch": 19.21309255079007, + "grad_norm": 276.65179443359375, + "learning_rate": 1.0780399274047187e-06, + "loss": 39.0355, + "step": 5322 + }, + { + "epoch": 19.216704288939052, + "grad_norm": 377.314697265625, + "learning_rate": 1.0725952813067151e-06, + "loss": 39.0543, + "step": 5323 + }, + { + "epoch": 19.220316027088035, + "grad_norm": 282.5917053222656, + "learning_rate": 1.0671506352087116e-06, + "loss": 37.1001, + "step": 5324 + }, + { + "epoch": 19.223927765237022, + "grad_norm": 420.4558410644531, + "learning_rate": 1.0617059891107078e-06, + "loss": 36.5363, + "step": 5325 + }, + { + "epoch": 19.227539503386005, + "grad_norm": 460.62701416015625, + "learning_rate": 1.0562613430127042e-06, + "loss": 35.8127, + "step": 5326 + }, + { + "epoch": 19.231151241534988, + "grad_norm": 492.31170654296875, + "learning_rate": 1.0508166969147007e-06, + "loss": 35.7043, + "step": 5327 + }, + { + "epoch": 19.23476297968397, + "grad_norm": 385.2608947753906, + "learning_rate": 1.0453720508166971e-06, + "loss": 35.0656, + "step": 5328 + }, + { + "epoch": 19.238374717832958, + "grad_norm": 322.3689270019531, + "learning_rate": 1.0399274047186931e-06, + "loss": 37.2145, + "step": 5329 + }, + { + "epoch": 19.24198645598194, + "grad_norm": 309.3829650878906, + "learning_rate": 1.0344827586206896e-06, + "loss": 35.4361, + "step": 5330 + }, + { + "epoch": 19.24198645598194, + "eval_loss": 0.6023690104484558, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 5330 + }, + { + "epoch": 19.245598194130924, + "grad_norm": 342.5604248046875, + "learning_rate": 1.029038112522686e-06, + "loss": 36.9204, + "step": 5331 + }, + { + "epoch": 19.24920993227991, + "grad_norm": 404.432373046875, + "learning_rate": 1.0235934664246825e-06, + "loss": 37.9907, + "step": 5332 + }, + { + "epoch": 19.252821670428894, + "grad_norm": 333.77044677734375, + "learning_rate": 1.0181488203266787e-06, + "loss": 36.1432, + "step": 5333 + }, + { + "epoch": 19.256433408577877, + "grad_norm": 297.11480712890625, + "learning_rate": 1.0127041742286751e-06, + "loss": 37.824, + "step": 5334 + }, + { + "epoch": 19.260045146726863, + "grad_norm": 271.3321838378906, + "learning_rate": 1.0072595281306716e-06, + "loss": 36.0811, + "step": 5335 + }, + { + "epoch": 19.263656884875846, + "grad_norm": 246.6988525390625, + "learning_rate": 1.0018148820326678e-06, + "loss": 36.6415, + "step": 5336 + }, + { + "epoch": 19.26726862302483, + "grad_norm": 264.7515563964844, + "learning_rate": 9.963702359346642e-07, + "loss": 37.048, + "step": 5337 + }, + { + "epoch": 19.270880361173816, + "grad_norm": 238.71475219726562, + "learning_rate": 9.909255898366607e-07, + "loss": 37.3109, + "step": 5338 + }, + { + "epoch": 19.2744920993228, + "grad_norm": 232.89256286621094, + "learning_rate": 9.854809437386571e-07, + "loss": 37.0776, + "step": 5339 + }, + { + "epoch": 19.278103837471782, + "grad_norm": 309.91796875, + "learning_rate": 9.800362976406534e-07, + "loss": 37.5227, + "step": 5340 + }, + { + "epoch": 19.278103837471782, + "eval_loss": 0.603413999080658, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.993, + "eval_steps_per_second": 56.993, + "step": 5340 + }, + { + "epoch": 19.28171557562077, + "grad_norm": 415.85009765625, + "learning_rate": 9.745916515426498e-07, + "loss": 38.7916, + "step": 5341 + }, + { + "epoch": 19.285327313769752, + "grad_norm": 336.5480651855469, + "learning_rate": 9.691470054446462e-07, + "loss": 34.7108, + "step": 5342 + }, + { + "epoch": 19.288939051918735, + "grad_norm": 361.7843017578125, + "learning_rate": 9.637023593466425e-07, + "loss": 33.3624, + "step": 5343 + }, + { + "epoch": 19.292550790067722, + "grad_norm": 278.5044250488281, + "learning_rate": 9.582577132486387e-07, + "loss": 31.9202, + "step": 5344 + }, + { + "epoch": 19.296162528216705, + "grad_norm": 378.85003662109375, + "learning_rate": 9.528130671506353e-07, + "loss": 32.0191, + "step": 5345 + }, + { + "epoch": 19.299774266365688, + "grad_norm": 307.8309020996094, + "learning_rate": 9.473684210526316e-07, + "loss": 30.1278, + "step": 5346 + }, + { + "epoch": 19.30338600451467, + "grad_norm": 377.0649108886719, + "learning_rate": 9.41923774954628e-07, + "loss": 30.8298, + "step": 5347 + }, + { + "epoch": 19.306997742663658, + "grad_norm": 366.9952392578125, + "learning_rate": 9.364791288566244e-07, + "loss": 32.8491, + "step": 5348 + }, + { + "epoch": 19.31060948081264, + "grad_norm": 384.6134948730469, + "learning_rate": 9.310344827586207e-07, + "loss": 33.3014, + "step": 5349 + }, + { + "epoch": 19.314221218961624, + "grad_norm": 377.0379943847656, + "learning_rate": 9.255898366606171e-07, + "loss": 31.1514, + "step": 5350 + }, + { + "epoch": 19.314221218961624, + "eval_loss": 0.6012714505195618, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 5350 + }, + { + "epoch": 19.31783295711061, + "grad_norm": 419.49359130859375, + "learning_rate": 9.201451905626135e-07, + "loss": 32.2402, + "step": 5351 + }, + { + "epoch": 19.321444695259594, + "grad_norm": 290.20050048828125, + "learning_rate": 9.147005444646098e-07, + "loss": 33.9084, + "step": 5352 + }, + { + "epoch": 19.325056433408577, + "grad_norm": 283.597412109375, + "learning_rate": 9.092558983666062e-07, + "loss": 34.3691, + "step": 5353 + }, + { + "epoch": 19.328668171557563, + "grad_norm": 322.4947204589844, + "learning_rate": 9.038112522686025e-07, + "loss": 33.2218, + "step": 5354 + }, + { + "epoch": 19.332279909706546, + "grad_norm": 346.0417785644531, + "learning_rate": 8.983666061705989e-07, + "loss": 32.6409, + "step": 5355 + }, + { + "epoch": 19.33589164785553, + "grad_norm": 282.1748962402344, + "learning_rate": 8.929219600725953e-07, + "loss": 33.722, + "step": 5356 + }, + { + "epoch": 19.339503386004516, + "grad_norm": 302.015625, + "learning_rate": 8.874773139745917e-07, + "loss": 35.1681, + "step": 5357 + }, + { + "epoch": 19.3431151241535, + "grad_norm": 325.37005615234375, + "learning_rate": 8.82032667876588e-07, + "loss": 34.2712, + "step": 5358 + }, + { + "epoch": 19.346726862302482, + "grad_norm": 291.301513671875, + "learning_rate": 8.765880217785844e-07, + "loss": 31.3185, + "step": 5359 + }, + { + "epoch": 19.35033860045147, + "grad_norm": 190.09767150878906, + "learning_rate": 8.711433756805807e-07, + "loss": 22.3868, + "step": 5360 + }, + { + "epoch": 19.35033860045147, + "eval_loss": 0.6009277105331421, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 5360 + }, + { + "epoch": 19.353950338600452, + "grad_norm": 231.69676208496094, + "learning_rate": 8.656987295825772e-07, + "loss": 21.1889, + "step": 5361 + }, + { + "epoch": 19.357562076749435, + "grad_norm": 255.91258239746094, + "learning_rate": 8.602540834845735e-07, + "loss": 23.1246, + "step": 5362 + }, + { + "epoch": 19.36117381489842, + "grad_norm": 265.2499694824219, + "learning_rate": 8.548094373865699e-07, + "loss": 22.9017, + "step": 5363 + }, + { + "epoch": 19.364785553047405, + "grad_norm": 217.06552124023438, + "learning_rate": 8.493647912885663e-07, + "loss": 38.4372, + "step": 5364 + }, + { + "epoch": 19.368397291196388, + "grad_norm": 220.9014434814453, + "learning_rate": 8.439201451905627e-07, + "loss": 38.8259, + "step": 5365 + }, + { + "epoch": 19.37200902934537, + "grad_norm": 217.46336364746094, + "learning_rate": 8.38475499092559e-07, + "loss": 37.7587, + "step": 5366 + }, + { + "epoch": 19.375620767494357, + "grad_norm": 219.59889221191406, + "learning_rate": 8.330308529945554e-07, + "loss": 38.2973, + "step": 5367 + }, + { + "epoch": 19.37923250564334, + "grad_norm": 206.93772888183594, + "learning_rate": 8.275862068965517e-07, + "loss": 36.6878, + "step": 5368 + }, + { + "epoch": 19.382844243792324, + "grad_norm": 268.5470886230469, + "learning_rate": 8.221415607985482e-07, + "loss": 37.4095, + "step": 5369 + }, + { + "epoch": 19.38645598194131, + "grad_norm": 228.70216369628906, + "learning_rate": 8.166969147005445e-07, + "loss": 39.1159, + "step": 5370 + }, + { + "epoch": 19.38645598194131, + "eval_loss": 0.6011511087417603, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 5370 + }, + { + "epoch": 19.390067720090293, + "grad_norm": 212.8670654296875, + "learning_rate": 8.112522686025409e-07, + "loss": 38.8929, + "step": 5371 + }, + { + "epoch": 19.393679458239276, + "grad_norm": 228.0734405517578, + "learning_rate": 8.058076225045372e-07, + "loss": 39.7208, + "step": 5372 + }, + { + "epoch": 19.397291196388263, + "grad_norm": 239.56906127929688, + "learning_rate": 8.003629764065335e-07, + "loss": 38.3748, + "step": 5373 + }, + { + "epoch": 19.400902934537246, + "grad_norm": 243.6251220703125, + "learning_rate": 7.9491833030853e-07, + "loss": 37.3178, + "step": 5374 + }, + { + "epoch": 19.40451467268623, + "grad_norm": 407.86907958984375, + "learning_rate": 7.894736842105263e-07, + "loss": 36.5418, + "step": 5375 + }, + { + "epoch": 19.408126410835216, + "grad_norm": 260.6579284667969, + "learning_rate": 7.840290381125227e-07, + "loss": 36.9031, + "step": 5376 + }, + { + "epoch": 19.4117381489842, + "grad_norm": 358.63946533203125, + "learning_rate": 7.785843920145191e-07, + "loss": 35.4851, + "step": 5377 + }, + { + "epoch": 19.415349887133182, + "grad_norm": 414.06634521484375, + "learning_rate": 7.731397459165155e-07, + "loss": 34.6983, + "step": 5378 + }, + { + "epoch": 19.41896162528217, + "grad_norm": 471.287109375, + "learning_rate": 7.676950998185117e-07, + "loss": 36.7265, + "step": 5379 + }, + { + "epoch": 19.42257336343115, + "grad_norm": 366.92767333984375, + "learning_rate": 7.622504537205082e-07, + "loss": 35.4779, + "step": 5380 + }, + { + "epoch": 19.42257336343115, + "eval_loss": 0.6010181903839111, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 5380 + }, + { + "epoch": 19.426185101580135, + "grad_norm": 392.23138427734375, + "learning_rate": 7.568058076225045e-07, + "loss": 36.1143, + "step": 5381 + }, + { + "epoch": 19.42979683972912, + "grad_norm": 296.0258483886719, + "learning_rate": 7.51361161524501e-07, + "loss": 37.5785, + "step": 5382 + }, + { + "epoch": 19.433408577878104, + "grad_norm": 425.22247314453125, + "learning_rate": 7.459165154264973e-07, + "loss": 37.7905, + "step": 5383 + }, + { + "epoch": 19.437020316027088, + "grad_norm": 288.7919921875, + "learning_rate": 7.404718693284937e-07, + "loss": 36.3987, + "step": 5384 + }, + { + "epoch": 19.44063205417607, + "grad_norm": 269.2157287597656, + "learning_rate": 7.3502722323049e-07, + "loss": 36.9862, + "step": 5385 + }, + { + "epoch": 19.444243792325057, + "grad_norm": 236.28067016601562, + "learning_rate": 7.295825771324864e-07, + "loss": 36.3645, + "step": 5386 + }, + { + "epoch": 19.44785553047404, + "grad_norm": 217.44627380371094, + "learning_rate": 7.241379310344827e-07, + "loss": 37.0505, + "step": 5387 + }, + { + "epoch": 19.451467268623023, + "grad_norm": 260.61175537109375, + "learning_rate": 7.186932849364792e-07, + "loss": 37.1031, + "step": 5388 + }, + { + "epoch": 19.45507900677201, + "grad_norm": 282.62017822265625, + "learning_rate": 7.132486388384755e-07, + "loss": 38.2061, + "step": 5389 + }, + { + "epoch": 19.458690744920993, + "grad_norm": 231.78170776367188, + "learning_rate": 7.07803992740472e-07, + "loss": 35.8868, + "step": 5390 + }, + { + "epoch": 19.458690744920993, + "eval_loss": 0.6014392375946045, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 5390 + }, + { + "epoch": 19.462302483069976, + "grad_norm": 246.38380432128906, + "learning_rate": 7.023593466424683e-07, + "loss": 36.1871, + "step": 5391 + }, + { + "epoch": 19.465914221218963, + "grad_norm": 239.06924438476562, + "learning_rate": 6.969147005444645e-07, + "loss": 34.5704, + "step": 5392 + }, + { + "epoch": 19.469525959367946, + "grad_norm": 396.09027099609375, + "learning_rate": 6.91470054446461e-07, + "loss": 33.6148, + "step": 5393 + }, + { + "epoch": 19.47313769751693, + "grad_norm": 250.8205108642578, + "learning_rate": 6.860254083484573e-07, + "loss": 31.535, + "step": 5394 + }, + { + "epoch": 19.476749435665916, + "grad_norm": 257.0039978027344, + "learning_rate": 6.805807622504538e-07, + "loss": 31.6366, + "step": 5395 + }, + { + "epoch": 19.4803611738149, + "grad_norm": 283.7515563964844, + "learning_rate": 6.751361161524501e-07, + "loss": 30.4001, + "step": 5396 + }, + { + "epoch": 19.483972911963882, + "grad_norm": 335.6957702636719, + "learning_rate": 6.696914700544465e-07, + "loss": 31.1016, + "step": 5397 + }, + { + "epoch": 19.48758465011287, + "grad_norm": 338.0590515136719, + "learning_rate": 6.642468239564429e-07, + "loss": 31.7707, + "step": 5398 + }, + { + "epoch": 19.49119638826185, + "grad_norm": 409.0957946777344, + "learning_rate": 6.588021778584392e-07, + "loss": 34.904, + "step": 5399 + }, + { + "epoch": 19.494808126410835, + "grad_norm": 265.0601806640625, + "learning_rate": 6.533575317604355e-07, + "loss": 32.1701, + "step": 5400 + }, + { + "epoch": 19.494808126410835, + "eval_loss": 0.6015393137931824, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 5400 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.6484994292672102e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..be3941b5328fd1ccd2f8dafb34a49b4a80e4c6cd --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7991a23e501447dc2f286d36e0f8cbb601735f39ce53187169fe163c5c18b4a +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..38b27abe24d7b34d12c97feadc92a1d1f304957e --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1d4ec3f16d38b10dec935cc751e03136fbd1a5c56c031f7e3e00df31d1c45f +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a537d95f7e71242a181283e7853e9b20f4245652 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5f2e27d454e40e5acf9d507cc03289fb2f749468df927398410b7364b46c316 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..644bd9b8aeb88915ad4dec5a3c710e0adea26b38 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e393f802fa65831a37bb4e3a706c5e9b1564b8efd87d696cb707e685ab2dbb4 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..da030a28fcdfcc8afd51f2e660548711c6b9de1c --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddeb303a224aa4949732928ecf496ccaa5f8496dae6231a50c6e1d792e698aff +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e650c5de2fc21aca74dea7a43757b8fe0343ead9 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/trainer_state.json @@ -0,0 +1,43089 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 19.92821670428894, + "eval_steps": 10, + "global_step": 5520, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + }, + { + "epoch": 2.892099322799097, + "grad_norm": 255.42556762695312, + "learning_rate": 2.569328493647913e-05, + "loss": 25.0794, + "step": 801 + }, + { + "epoch": 2.895711060948081, + "grad_norm": 235.7202606201172, + "learning_rate": 2.5687840290381127e-05, + "loss": 26.0036, + "step": 802 + }, + { + "epoch": 2.8993227990970656, + "grad_norm": 196.36996459960938, + "learning_rate": 2.5682395644283122e-05, + "loss": 26.1592, + "step": 803 + }, + { + "epoch": 2.9029345372460496, + "grad_norm": 254.46896362304688, + "learning_rate": 2.5676950998185118e-05, + "loss": 26.9693, + "step": 804 + }, + { + "epoch": 2.906546275395034, + "grad_norm": 435.552734375, + "learning_rate": 2.5671506352087113e-05, + "loss": 51.2981, + "step": 805 + }, + { + "epoch": 2.910158013544018, + "grad_norm": 523.48388671875, + "learning_rate": 2.566606170598911e-05, + "loss": 50.1727, + "step": 806 + }, + { + "epoch": 2.9137697516930023, + "grad_norm": 432.06561279296875, + "learning_rate": 2.5660617059891107e-05, + "loss": 50.5994, + "step": 807 + }, + { + "epoch": 2.9173814898419863, + "grad_norm": 354.7589416503906, + "learning_rate": 2.5655172413793106e-05, + "loss": 49.3211, + "step": 808 + }, + { + "epoch": 2.9209932279909707, + "grad_norm": 327.1822509765625, + "learning_rate": 2.56497277676951e-05, + "loss": 48.1541, + "step": 809 + }, + { + "epoch": 2.9246049661399547, + "grad_norm": 309.42279052734375, + "learning_rate": 2.5644283121597097e-05, + "loss": 46.6141, + "step": 810 + }, + { + "epoch": 2.9246049661399547, + "eval_loss": 0.6766613721847534, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 810 + }, + { + "epoch": 2.928216704288939, + "grad_norm": 286.34368896484375, + "learning_rate": 2.5638838475499092e-05, + "loss": 43.6344, + "step": 811 + }, + { + "epoch": 2.931828442437923, + "grad_norm": 224.4126739501953, + "learning_rate": 2.563339382940109e-05, + "loss": 43.0613, + "step": 812 + }, + { + "epoch": 2.9354401805869075, + "grad_norm": 246.1632537841797, + "learning_rate": 2.5627949183303086e-05, + "loss": 42.9807, + "step": 813 + }, + { + "epoch": 2.939051918735892, + "grad_norm": 298.6852722167969, + "learning_rate": 2.562250453720508e-05, + "loss": 43.4627, + "step": 814 + }, + { + "epoch": 2.942663656884876, + "grad_norm": 255.9106903076172, + "learning_rate": 2.5617059891107077e-05, + "loss": 42.576, + "step": 815 + }, + { + "epoch": 2.94627539503386, + "grad_norm": 227.76461791992188, + "learning_rate": 2.5611615245009072e-05, + "loss": 43.5352, + "step": 816 + }, + { + "epoch": 2.9498871331828442, + "grad_norm": 262.1735534667969, + "learning_rate": 2.560617059891107e-05, + "loss": 44.6115, + "step": 817 + }, + { + "epoch": 2.9534988713318286, + "grad_norm": 261.7061767578125, + "learning_rate": 2.560072595281307e-05, + "loss": 45.1437, + "step": 818 + }, + { + "epoch": 2.9571106094808126, + "grad_norm": 241.5306396484375, + "learning_rate": 2.5595281306715065e-05, + "loss": 43.7623, + "step": 819 + }, + { + "epoch": 2.9607223476297966, + "grad_norm": 262.2628479003906, + "learning_rate": 2.558983666061706e-05, + "loss": 39.4783, + "step": 820 + }, + { + "epoch": 2.9607223476297966, + "eval_loss": 0.6567817330360413, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 820 + }, + { + "epoch": 2.964334085778781, + "grad_norm": 313.9430236816406, + "learning_rate": 2.5584392014519056e-05, + "loss": 37.5457, + "step": 821 + }, + { + "epoch": 2.9679458239277654, + "grad_norm": 223.93162536621094, + "learning_rate": 2.557894736842105e-05, + "loss": 36.6506, + "step": 822 + }, + { + "epoch": 2.9715575620767494, + "grad_norm": 319.6013488769531, + "learning_rate": 2.557350272232305e-05, + "loss": 39.2182, + "step": 823 + }, + { + "epoch": 2.975169300225734, + "grad_norm": 237.14610290527344, + "learning_rate": 2.5568058076225046e-05, + "loss": 39.3368, + "step": 824 + }, + { + "epoch": 2.9787810383747177, + "grad_norm": 270.99481201171875, + "learning_rate": 2.556261343012704e-05, + "loss": 38.105, + "step": 825 + }, + { + "epoch": 2.982392776523702, + "grad_norm": 236.88687133789062, + "learning_rate": 2.555716878402904e-05, + "loss": 40.1464, + "step": 826 + }, + { + "epoch": 2.986004514672686, + "grad_norm": 205.72084045410156, + "learning_rate": 2.5551724137931035e-05, + "loss": 41.4284, + "step": 827 + }, + { + "epoch": 2.9896162528216705, + "grad_norm": 243.73684692382812, + "learning_rate": 2.554627949183303e-05, + "loss": 41.4085, + "step": 828 + }, + { + "epoch": 2.9932279909706545, + "grad_norm": 200.96815490722656, + "learning_rate": 2.554083484573503e-05, + "loss": 28.59, + "step": 829 + }, + { + "epoch": 2.996839729119639, + "grad_norm": 258.556884765625, + "learning_rate": 2.5535390199637025e-05, + "loss": 25.85, + "step": 830 + }, + { + "epoch": 2.996839729119639, + "eval_loss": 0.6678276062011719, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 262.8646545410156, + "learning_rate": 2.552994555353902e-05, + "loss": 23.6925, + "step": 831 + }, + { + "epoch": 3.0036117381489844, + "grad_norm": 388.7956848144531, + "learning_rate": 2.5524500907441015e-05, + "loss": 47.4725, + "step": 832 + }, + { + "epoch": 3.0072234762979684, + "grad_norm": 371.5218200683594, + "learning_rate": 2.551905626134301e-05, + "loss": 48.1197, + "step": 833 + }, + { + "epoch": 3.010835214446953, + "grad_norm": 296.68096923828125, + "learning_rate": 2.551361161524501e-05, + "loss": 49.0124, + "step": 834 + }, + { + "epoch": 3.0144469525959368, + "grad_norm": 258.3492126464844, + "learning_rate": 2.550816696914701e-05, + "loss": 47.0989, + "step": 835 + }, + { + "epoch": 3.018058690744921, + "grad_norm": 262.0732116699219, + "learning_rate": 2.5502722323049004e-05, + "loss": 46.5102, + "step": 836 + }, + { + "epoch": 3.021670428893905, + "grad_norm": 249.84967041015625, + "learning_rate": 2.5497277676951e-05, + "loss": 47.2614, + "step": 837 + }, + { + "epoch": 3.0252821670428895, + "grad_norm": 259.7544250488281, + "learning_rate": 2.5491833030852995e-05, + "loss": 44.8942, + "step": 838 + }, + { + "epoch": 3.0288939051918735, + "grad_norm": 264.3735656738281, + "learning_rate": 2.5486388384754993e-05, + "loss": 45.42, + "step": 839 + }, + { + "epoch": 3.032505643340858, + "grad_norm": 295.92919921875, + "learning_rate": 2.548094373865699e-05, + "loss": 46.1006, + "step": 840 + }, + { + "epoch": 3.032505643340858, + "eval_loss": 0.6581276059150696, + "eval_runtime": 3.1326, + "eval_samples_per_second": 57.14, + "eval_steps_per_second": 57.14, + "step": 840 + }, + { + "epoch": 3.036117381489842, + "grad_norm": 311.7466125488281, + "learning_rate": 2.5475499092558984e-05, + "loss": 46.1223, + "step": 841 + }, + { + "epoch": 3.0397291196388263, + "grad_norm": 208.77503967285156, + "learning_rate": 2.547005444646098e-05, + "loss": 45.1578, + "step": 842 + }, + { + "epoch": 3.0433408577878103, + "grad_norm": 203.6681671142578, + "learning_rate": 2.5464609800362975e-05, + "loss": 42.9368, + "step": 843 + }, + { + "epoch": 3.0469525959367947, + "grad_norm": 251.1130828857422, + "learning_rate": 2.5459165154264974e-05, + "loss": 42.4021, + "step": 844 + }, + { + "epoch": 3.0505643340857787, + "grad_norm": 253.73077392578125, + "learning_rate": 2.5453720508166972e-05, + "loss": 41.7869, + "step": 845 + }, + { + "epoch": 3.054176072234763, + "grad_norm": 202.12892150878906, + "learning_rate": 2.5448275862068968e-05, + "loss": 41.3124, + "step": 846 + }, + { + "epoch": 3.057787810383747, + "grad_norm": 250.02322387695312, + "learning_rate": 2.5442831215970963e-05, + "loss": 41.1522, + "step": 847 + }, + { + "epoch": 3.0613995485327314, + "grad_norm": 171.8944549560547, + "learning_rate": 2.543738656987296e-05, + "loss": 41.4023, + "step": 848 + }, + { + "epoch": 3.0650112866817154, + "grad_norm": 245.9447784423828, + "learning_rate": 2.5431941923774954e-05, + "loss": 43.0454, + "step": 849 + }, + { + "epoch": 3.0686230248307, + "grad_norm": 216.93519592285156, + "learning_rate": 2.5426497277676953e-05, + "loss": 43.7984, + "step": 850 + }, + { + "epoch": 3.0686230248307, + "eval_loss": 0.6542946100234985, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 850 + }, + { + "epoch": 3.072234762979684, + "grad_norm": 271.1915588378906, + "learning_rate": 2.5421052631578948e-05, + "loss": 42.7845, + "step": 851 + }, + { + "epoch": 3.075846501128668, + "grad_norm": 262.10791015625, + "learning_rate": 2.5415607985480943e-05, + "loss": 43.042, + "step": 852 + }, + { + "epoch": 3.079458239277652, + "grad_norm": 234.5153045654297, + "learning_rate": 2.541016333938294e-05, + "loss": 42.541, + "step": 853 + }, + { + "epoch": 3.0830699774266366, + "grad_norm": 183.66058349609375, + "learning_rate": 2.5404718693284938e-05, + "loss": 43.6462, + "step": 854 + }, + { + "epoch": 3.0866817155756205, + "grad_norm": 232.13169860839844, + "learning_rate": 2.5399274047186933e-05, + "loss": 43.9704, + "step": 855 + }, + { + "epoch": 3.090293453724605, + "grad_norm": 219.72445678710938, + "learning_rate": 2.5393829401088932e-05, + "loss": 43.1515, + "step": 856 + }, + { + "epoch": 3.0939051918735894, + "grad_norm": 215.75115966796875, + "learning_rate": 2.5388384754990927e-05, + "loss": 43.9146, + "step": 857 + }, + { + "epoch": 3.0975169300225733, + "grad_norm": 248.385498046875, + "learning_rate": 2.5382940108892923e-05, + "loss": 44.323, + "step": 858 + }, + { + "epoch": 3.1011286681715577, + "grad_norm": 295.951171875, + "learning_rate": 2.5377495462794918e-05, + "loss": 45.381, + "step": 859 + }, + { + "epoch": 3.1047404063205417, + "grad_norm": 239.43002319335938, + "learning_rate": 2.5372050816696913e-05, + "loss": 42.4742, + "step": 860 + }, + { + "epoch": 3.1047404063205417, + "eval_loss": 0.647969663143158, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 860 + }, + { + "epoch": 3.108352144469526, + "grad_norm": 207.01927185058594, + "learning_rate": 2.5366606170598912e-05, + "loss": 39.4421, + "step": 861 + }, + { + "epoch": 3.11196388261851, + "grad_norm": 255.97584533691406, + "learning_rate": 2.5361161524500907e-05, + "loss": 37.9044, + "step": 862 + }, + { + "epoch": 3.1155756207674945, + "grad_norm": 379.0328674316406, + "learning_rate": 2.5355716878402906e-05, + "loss": 36.04, + "step": 863 + }, + { + "epoch": 3.1191873589164785, + "grad_norm": 216.48049926757812, + "learning_rate": 2.53502722323049e-05, + "loss": 36.4328, + "step": 864 + }, + { + "epoch": 3.122799097065463, + "grad_norm": 242.0985565185547, + "learning_rate": 2.5344827586206897e-05, + "loss": 36.5303, + "step": 865 + }, + { + "epoch": 3.126410835214447, + "grad_norm": 212.566650390625, + "learning_rate": 2.5339382940108892e-05, + "loss": 38.199, + "step": 866 + }, + { + "epoch": 3.1300225733634313, + "grad_norm": 217.37811279296875, + "learning_rate": 2.533393829401089e-05, + "loss": 39.0308, + "step": 867 + }, + { + "epoch": 3.1336343115124152, + "grad_norm": 186.531494140625, + "learning_rate": 2.5328493647912887e-05, + "loss": 36.3811, + "step": 868 + }, + { + "epoch": 3.1372460496613996, + "grad_norm": 202.18603515625, + "learning_rate": 2.5323049001814882e-05, + "loss": 37.8778, + "step": 869 + }, + { + "epoch": 3.1408577878103836, + "grad_norm": 246.00283813476562, + "learning_rate": 2.5317604355716877e-05, + "loss": 38.3339, + "step": 870 + }, + { + "epoch": 3.1408577878103836, + "eval_loss": 0.6535190343856812, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 870 + }, + { + "epoch": 3.144469525959368, + "grad_norm": 246.4683074951172, + "learning_rate": 2.5312159709618873e-05, + "loss": 38.8566, + "step": 871 + }, + { + "epoch": 3.148081264108352, + "grad_norm": 243.6247100830078, + "learning_rate": 2.5306715063520875e-05, + "loss": 38.0433, + "step": 872 + }, + { + "epoch": 3.1516930022573364, + "grad_norm": 273.507080078125, + "learning_rate": 2.530127041742287e-05, + "loss": 40.121, + "step": 873 + }, + { + "epoch": 3.1553047404063204, + "grad_norm": 243.57203674316406, + "learning_rate": 2.5295825771324866e-05, + "loss": 38.9714, + "step": 874 + }, + { + "epoch": 3.1589164785553048, + "grad_norm": 206.15533447265625, + "learning_rate": 2.529038112522686e-05, + "loss": 38.7573, + "step": 875 + }, + { + "epoch": 3.1625282167042887, + "grad_norm": 322.87799072265625, + "learning_rate": 2.5284936479128856e-05, + "loss": 41.3548, + "step": 876 + }, + { + "epoch": 3.166139954853273, + "grad_norm": 259.7116394042969, + "learning_rate": 2.5279491833030852e-05, + "loss": 30.5113, + "step": 877 + }, + { + "epoch": 3.169751693002257, + "grad_norm": 277.6427307128906, + "learning_rate": 2.527404718693285e-05, + "loss": 26.152, + "step": 878 + }, + { + "epoch": 3.1733634311512415, + "grad_norm": 259.84588623046875, + "learning_rate": 2.5268602540834846e-05, + "loss": 25.543, + "step": 879 + }, + { + "epoch": 3.176975169300226, + "grad_norm": 205.59854125976562, + "learning_rate": 2.526315789473684e-05, + "loss": 25.2503, + "step": 880 + }, + { + "epoch": 3.176975169300226, + "eval_loss": 0.6754873394966125, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 880 + }, + { + "epoch": 3.18058690744921, + "grad_norm": 231.8431396484375, + "learning_rate": 2.525771324863884e-05, + "loss": 25.8277, + "step": 881 + }, + { + "epoch": 3.1841986455981943, + "grad_norm": 437.2222900390625, + "learning_rate": 2.5252268602540835e-05, + "loss": 48.7641, + "step": 882 + }, + { + "epoch": 3.1878103837471783, + "grad_norm": 409.23468017578125, + "learning_rate": 2.5246823956442834e-05, + "loss": 49.1716, + "step": 883 + }, + { + "epoch": 3.1914221218961627, + "grad_norm": 446.9589538574219, + "learning_rate": 2.524137931034483e-05, + "loss": 49.1204, + "step": 884 + }, + { + "epoch": 3.1950338600451467, + "grad_norm": 365.4459228515625, + "learning_rate": 2.5235934664246825e-05, + "loss": 46.3124, + "step": 885 + }, + { + "epoch": 3.198645598194131, + "grad_norm": 329.88677978515625, + "learning_rate": 2.523049001814882e-05, + "loss": 46.4021, + "step": 886 + }, + { + "epoch": 3.202257336343115, + "grad_norm": 271.31201171875, + "learning_rate": 2.5225045372050816e-05, + "loss": 46.4958, + "step": 887 + }, + { + "epoch": 3.2058690744920995, + "grad_norm": 281.3929138183594, + "learning_rate": 2.521960072595281e-05, + "loss": 46.238, + "step": 888 + }, + { + "epoch": 3.2094808126410834, + "grad_norm": 279.1689147949219, + "learning_rate": 2.521415607985481e-05, + "loss": 47.0312, + "step": 889 + }, + { + "epoch": 3.213092550790068, + "grad_norm": 296.18115234375, + "learning_rate": 2.520871143375681e-05, + "loss": 46.1837, + "step": 890 + }, + { + "epoch": 3.213092550790068, + "eval_loss": 0.666180431842804, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 890 + }, + { + "epoch": 3.216704288939052, + "grad_norm": 315.7528991699219, + "learning_rate": 2.5203266787658804e-05, + "loss": 46.1023, + "step": 891 + }, + { + "epoch": 3.220316027088036, + "grad_norm": 296.7471923828125, + "learning_rate": 2.51978221415608e-05, + "loss": 43.9875, + "step": 892 + }, + { + "epoch": 3.22392776523702, + "grad_norm": 238.37600708007812, + "learning_rate": 2.5192377495462795e-05, + "loss": 42.0998, + "step": 893 + }, + { + "epoch": 3.2275395033860046, + "grad_norm": 221.9834442138672, + "learning_rate": 2.5186932849364794e-05, + "loss": 40.791, + "step": 894 + }, + { + "epoch": 3.2311512415349886, + "grad_norm": 221.9122314453125, + "learning_rate": 2.518148820326679e-05, + "loss": 41.6985, + "step": 895 + }, + { + "epoch": 3.234762979683973, + "grad_norm": 269.44561767578125, + "learning_rate": 2.5176043557168784e-05, + "loss": 42.978, + "step": 896 + }, + { + "epoch": 3.238374717832957, + "grad_norm": 207.09165954589844, + "learning_rate": 2.517059891107078e-05, + "loss": 41.4141, + "step": 897 + }, + { + "epoch": 3.2419864559819414, + "grad_norm": 236.3747100830078, + "learning_rate": 2.5165154264972775e-05, + "loss": 41.7936, + "step": 898 + }, + { + "epoch": 3.2455981941309253, + "grad_norm": 194.84373474121094, + "learning_rate": 2.515970961887477e-05, + "loss": 42.0031, + "step": 899 + }, + { + "epoch": 3.2492099322799097, + "grad_norm": 220.2052459716797, + "learning_rate": 2.5154264972776773e-05, + "loss": 43.2596, + "step": 900 + }, + { + "epoch": 3.2492099322799097, + "eval_loss": 0.6527710556983948, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 900 + }, + { + "epoch": 3.2528216704288937, + "grad_norm": 190.3020782470703, + "learning_rate": 2.5148820326678768e-05, + "loss": 43.2345, + "step": 901 + }, + { + "epoch": 3.256433408577878, + "grad_norm": 200.23194885253906, + "learning_rate": 2.5143375680580763e-05, + "loss": 42.9185, + "step": 902 + }, + { + "epoch": 3.2600451467268625, + "grad_norm": 207.30697631835938, + "learning_rate": 2.513793103448276e-05, + "loss": 42.7929, + "step": 903 + }, + { + "epoch": 3.2636568848758465, + "grad_norm": 180.4369354248047, + "learning_rate": 2.5132486388384754e-05, + "loss": 43.6829, + "step": 904 + }, + { + "epoch": 3.2672686230248305, + "grad_norm": 169.92384338378906, + "learning_rate": 2.5127041742286753e-05, + "loss": 42.6406, + "step": 905 + }, + { + "epoch": 3.270880361173815, + "grad_norm": 224.46177673339844, + "learning_rate": 2.512159709618875e-05, + "loss": 43.314, + "step": 906 + }, + { + "epoch": 3.2744920993227993, + "grad_norm": 246.6527862548828, + "learning_rate": 2.5116152450090744e-05, + "loss": 44.1259, + "step": 907 + }, + { + "epoch": 3.2781038374717832, + "grad_norm": 201.84552001953125, + "learning_rate": 2.511070780399274e-05, + "loss": 43.7819, + "step": 908 + }, + { + "epoch": 3.2817155756207677, + "grad_norm": 195.65174865722656, + "learning_rate": 2.5105263157894738e-05, + "loss": 41.0509, + "step": 909 + }, + { + "epoch": 3.2853273137697516, + "grad_norm": 238.36911010742188, + "learning_rate": 2.5099818511796733e-05, + "loss": 39.3365, + "step": 910 + }, + { + "epoch": 3.2853273137697516, + "eval_loss": 0.6488128900527954, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 910 + }, + { + "epoch": 3.288939051918736, + "grad_norm": 235.58831787109375, + "learning_rate": 2.5094373865698732e-05, + "loss": 36.081, + "step": 911 + }, + { + "epoch": 3.29255079006772, + "grad_norm": 246.3998565673828, + "learning_rate": 2.5088929219600727e-05, + "loss": 35.9779, + "step": 912 + }, + { + "epoch": 3.2961625282167044, + "grad_norm": 224.34561157226562, + "learning_rate": 2.5083484573502723e-05, + "loss": 35.0636, + "step": 913 + }, + { + "epoch": 3.2997742663656884, + "grad_norm": 203.2981414794922, + "learning_rate": 2.5078039927404718e-05, + "loss": 36.6272, + "step": 914 + }, + { + "epoch": 3.303386004514673, + "grad_norm": 179.4558563232422, + "learning_rate": 2.5072595281306714e-05, + "loss": 36.0493, + "step": 915 + }, + { + "epoch": 3.3069977426636568, + "grad_norm": 240.01748657226562, + "learning_rate": 2.5067150635208712e-05, + "loss": 36.9084, + "step": 916 + }, + { + "epoch": 3.310609480812641, + "grad_norm": 264.4375, + "learning_rate": 2.5061705989110708e-05, + "loss": 37.3878, + "step": 917 + }, + { + "epoch": 3.314221218961625, + "grad_norm": 207.66322326660156, + "learning_rate": 2.5056261343012707e-05, + "loss": 37.0502, + "step": 918 + }, + { + "epoch": 3.3178329571106095, + "grad_norm": 304.8887634277344, + "learning_rate": 2.5050816696914702e-05, + "loss": 39.0532, + "step": 919 + }, + { + "epoch": 3.3214446952595935, + "grad_norm": 242.4520721435547, + "learning_rate": 2.5045372050816697e-05, + "loss": 37.9885, + "step": 920 + }, + { + "epoch": 3.3214446952595935, + "eval_loss": 0.6546927690505981, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 920 + }, + { + "epoch": 3.325056433408578, + "grad_norm": 297.5693054199219, + "learning_rate": 2.5039927404718693e-05, + "loss": 39.0422, + "step": 921 + }, + { + "epoch": 3.328668171557562, + "grad_norm": 208.76441955566406, + "learning_rate": 2.503448275862069e-05, + "loss": 39.2992, + "step": 922 + }, + { + "epoch": 3.3322799097065463, + "grad_norm": 265.2093200683594, + "learning_rate": 2.5029038112522687e-05, + "loss": 39.7897, + "step": 923 + }, + { + "epoch": 3.3358916478555303, + "grad_norm": 279.2838439941406, + "learning_rate": 2.5023593466424682e-05, + "loss": 39.6141, + "step": 924 + }, + { + "epoch": 3.3395033860045147, + "grad_norm": 246.86895751953125, + "learning_rate": 2.5018148820326678e-05, + "loss": 39.5724, + "step": 925 + }, + { + "epoch": 3.343115124153499, + "grad_norm": 315.27838134765625, + "learning_rate": 2.5012704174228673e-05, + "loss": 40.0274, + "step": 926 + }, + { + "epoch": 3.346726862302483, + "grad_norm": 286.7344665527344, + "learning_rate": 2.5007259528130675e-05, + "loss": 27.8964, + "step": 927 + }, + { + "epoch": 3.350338600451467, + "grad_norm": 320.6955261230469, + "learning_rate": 2.500181488203267e-05, + "loss": 26.1333, + "step": 928 + }, + { + "epoch": 3.3539503386004514, + "grad_norm": 271.5133972167969, + "learning_rate": 2.4996370235934666e-05, + "loss": 25.0519, + "step": 929 + }, + { + "epoch": 3.357562076749436, + "grad_norm": 259.59234619140625, + "learning_rate": 2.499092558983666e-05, + "loss": 26.3701, + "step": 930 + }, + { + "epoch": 3.357562076749436, + "eval_loss": 0.6857922077178955, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 930 + }, + { + "epoch": 3.36117381489842, + "grad_norm": 240.87948608398438, + "learning_rate": 2.4985480943738657e-05, + "loss": 26.9694, + "step": 931 + }, + { + "epoch": 3.3647855530474042, + "grad_norm": 410.1781921386719, + "learning_rate": 2.4980036297640655e-05, + "loss": 50.6978, + "step": 932 + }, + { + "epoch": 3.368397291196388, + "grad_norm": 371.2940979003906, + "learning_rate": 2.497459165154265e-05, + "loss": 49.5872, + "step": 933 + }, + { + "epoch": 3.3720090293453726, + "grad_norm": 343.48809814453125, + "learning_rate": 2.4969147005444646e-05, + "loss": 48.7744, + "step": 934 + }, + { + "epoch": 3.3756207674943566, + "grad_norm": 334.878662109375, + "learning_rate": 2.496370235934664e-05, + "loss": 48.104, + "step": 935 + }, + { + "epoch": 3.379232505643341, + "grad_norm": 301.94696044921875, + "learning_rate": 2.4958257713248637e-05, + "loss": 47.1941, + "step": 936 + }, + { + "epoch": 3.382844243792325, + "grad_norm": 295.99810791015625, + "learning_rate": 2.4952813067150636e-05, + "loss": 46.8274, + "step": 937 + }, + { + "epoch": 3.3864559819413094, + "grad_norm": 240.8074188232422, + "learning_rate": 2.4947368421052635e-05, + "loss": 46.8453, + "step": 938 + }, + { + "epoch": 3.3900677200902933, + "grad_norm": 244.65985107421875, + "learning_rate": 2.494192377495463e-05, + "loss": 46.6894, + "step": 939 + }, + { + "epoch": 3.3936794582392777, + "grad_norm": 239.5635223388672, + "learning_rate": 2.4936479128856625e-05, + "loss": 45.5307, + "step": 940 + }, + { + "epoch": 3.3936794582392777, + "eval_loss": 0.6575602293014526, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 940 + }, + { + "epoch": 3.3972911963882617, + "grad_norm": 224.8990936279297, + "learning_rate": 2.493103448275862e-05, + "loss": 44.8465, + "step": 941 + }, + { + "epoch": 3.400902934537246, + "grad_norm": 263.9532165527344, + "learning_rate": 2.4925589836660616e-05, + "loss": 44.8556, + "step": 942 + }, + { + "epoch": 3.40451467268623, + "grad_norm": 252.4757080078125, + "learning_rate": 2.4920145190562615e-05, + "loss": 43.7434, + "step": 943 + }, + { + "epoch": 3.4081264108352145, + "grad_norm": 204.91795349121094, + "learning_rate": 2.491470054446461e-05, + "loss": 40.3602, + "step": 944 + }, + { + "epoch": 3.4117381489841985, + "grad_norm": 259.7920837402344, + "learning_rate": 2.4909255898366606e-05, + "loss": 41.5125, + "step": 945 + }, + { + "epoch": 3.415349887133183, + "grad_norm": 196.34872436523438, + "learning_rate": 2.4903811252268604e-05, + "loss": 42.1967, + "step": 946 + }, + { + "epoch": 3.418961625282167, + "grad_norm": 267.5933837890625, + "learning_rate": 2.48983666061706e-05, + "loss": 41.5637, + "step": 947 + }, + { + "epoch": 3.4225733634311513, + "grad_norm": 261.2299499511719, + "learning_rate": 2.4892921960072595e-05, + "loss": 41.3467, + "step": 948 + }, + { + "epoch": 3.4261851015801357, + "grad_norm": 195.84051513671875, + "learning_rate": 2.4887477313974594e-05, + "loss": 42.9534, + "step": 949 + }, + { + "epoch": 3.4297968397291196, + "grad_norm": 251.25294494628906, + "learning_rate": 2.488203266787659e-05, + "loss": 43.8068, + "step": 950 + }, + { + "epoch": 3.4297968397291196, + "eval_loss": 0.6576783657073975, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.19, + "eval_steps_per_second": 57.19, + "step": 950 + }, + { + "epoch": 3.4334085778781036, + "grad_norm": 221.30291748046875, + "learning_rate": 2.4876588021778585e-05, + "loss": 43.448, + "step": 951 + }, + { + "epoch": 3.437020316027088, + "grad_norm": 244.35842895507812, + "learning_rate": 2.487114337568058e-05, + "loss": 42.7632, + "step": 952 + }, + { + "epoch": 3.4406320541760724, + "grad_norm": 195.3553009033203, + "learning_rate": 2.4865698729582575e-05, + "loss": 43.2151, + "step": 953 + }, + { + "epoch": 3.4442437923250564, + "grad_norm": 179.0012969970703, + "learning_rate": 2.4860254083484574e-05, + "loss": 43.5462, + "step": 954 + }, + { + "epoch": 3.447855530474041, + "grad_norm": 200.3195343017578, + "learning_rate": 2.4854809437386573e-05, + "loss": 43.5087, + "step": 955 + }, + { + "epoch": 3.4514672686230248, + "grad_norm": 263.8428955078125, + "learning_rate": 2.484936479128857e-05, + "loss": 44.1719, + "step": 956 + }, + { + "epoch": 3.455079006772009, + "grad_norm": 208.326416015625, + "learning_rate": 2.4843920145190564e-05, + "loss": 43.245, + "step": 957 + }, + { + "epoch": 3.458690744920993, + "grad_norm": 193.4184112548828, + "learning_rate": 2.483847549909256e-05, + "loss": 44.3687, + "step": 958 + }, + { + "epoch": 3.4623024830699776, + "grad_norm": 201.8892059326172, + "learning_rate": 2.4833030852994555e-05, + "loss": 43.7617, + "step": 959 + }, + { + "epoch": 3.4659142212189615, + "grad_norm": 258.5245056152344, + "learning_rate": 2.4827586206896553e-05, + "loss": 43.0001, + "step": 960 + }, + { + "epoch": 3.4659142212189615, + "eval_loss": 0.6453068256378174, + "eval_runtime": 3.1355, + "eval_samples_per_second": 57.087, + "eval_steps_per_second": 57.087, + "step": 960 + }, + { + "epoch": 3.469525959367946, + "grad_norm": 218.70947265625, + "learning_rate": 2.482214156079855e-05, + "loss": 39.1967, + "step": 961 + }, + { + "epoch": 3.47313769751693, + "grad_norm": 267.3435363769531, + "learning_rate": 2.4816696914700544e-05, + "loss": 36.9852, + "step": 962 + }, + { + "epoch": 3.4767494356659143, + "grad_norm": 285.9330139160156, + "learning_rate": 2.481125226860254e-05, + "loss": 35.9727, + "step": 963 + }, + { + "epoch": 3.4803611738148983, + "grad_norm": 215.71005249023438, + "learning_rate": 2.4805807622504538e-05, + "loss": 36.7653, + "step": 964 + }, + { + "epoch": 3.4839729119638827, + "grad_norm": 232.87876892089844, + "learning_rate": 2.4800362976406537e-05, + "loss": 36.378, + "step": 965 + }, + { + "epoch": 3.4875846501128667, + "grad_norm": 171.5175018310547, + "learning_rate": 2.4794918330308532e-05, + "loss": 36.8383, + "step": 966 + }, + { + "epoch": 3.491196388261851, + "grad_norm": 215.11647033691406, + "learning_rate": 2.4789473684210528e-05, + "loss": 37.8672, + "step": 967 + }, + { + "epoch": 3.494808126410835, + "grad_norm": 219.3248291015625, + "learning_rate": 2.4784029038112523e-05, + "loss": 38.2493, + "step": 968 + }, + { + "epoch": 3.4984198645598195, + "grad_norm": 250.36343383789062, + "learning_rate": 2.477858439201452e-05, + "loss": 37.8047, + "step": 969 + }, + { + "epoch": 3.5020316027088034, + "grad_norm": 218.4738311767578, + "learning_rate": 2.4773139745916514e-05, + "loss": 38.3357, + "step": 970 + }, + { + "epoch": 3.5020316027088034, + "eval_loss": 0.6516546607017517, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.147, + "eval_steps_per_second": 57.147, + "step": 970 + }, + { + "epoch": 3.505643340857788, + "grad_norm": 229.22828674316406, + "learning_rate": 2.4767695099818513e-05, + "loss": 38.4684, + "step": 971 + }, + { + "epoch": 3.5092550790067722, + "grad_norm": 177.5777130126953, + "learning_rate": 2.4762250453720508e-05, + "loss": 38.4852, + "step": 972 + }, + { + "epoch": 3.512866817155756, + "grad_norm": 206.41226196289062, + "learning_rate": 2.4756805807622507e-05, + "loss": 39.2789, + "step": 973 + }, + { + "epoch": 3.51647855530474, + "grad_norm": 206.19235229492188, + "learning_rate": 2.4751361161524502e-05, + "loss": 39.4009, + "step": 974 + }, + { + "epoch": 3.5200902934537246, + "grad_norm": 293.0887145996094, + "learning_rate": 2.4745916515426498e-05, + "loss": 40.2545, + "step": 975 + }, + { + "epoch": 3.523702031602709, + "grad_norm": 304.7360534667969, + "learning_rate": 2.4740471869328496e-05, + "loss": 40.174, + "step": 976 + }, + { + "epoch": 3.527313769751693, + "grad_norm": 292.6968078613281, + "learning_rate": 2.4735027223230492e-05, + "loss": 28.7529, + "step": 977 + }, + { + "epoch": 3.530925507900677, + "grad_norm": 188.4938201904297, + "learning_rate": 2.4729582577132487e-05, + "loss": 25.3517, + "step": 978 + }, + { + "epoch": 3.5345372460496614, + "grad_norm": 187.330322265625, + "learning_rate": 2.4724137931034483e-05, + "loss": 24.9574, + "step": 979 + }, + { + "epoch": 3.5381489841986458, + "grad_norm": 198.25450134277344, + "learning_rate": 2.4718693284936478e-05, + "loss": 26.0505, + "step": 980 + }, + { + "epoch": 3.5381489841986458, + "eval_loss": 0.6837891936302185, + "eval_runtime": 3.1313, + "eval_samples_per_second": 57.164, + "eval_steps_per_second": 57.164, + "step": 980 + }, + { + "epoch": 3.5417607223476297, + "grad_norm": 221.72662353515625, + "learning_rate": 2.4713248638838473e-05, + "loss": 27.1157, + "step": 981 + }, + { + "epoch": 3.545372460496614, + "grad_norm": 449.80987548828125, + "learning_rate": 2.4707803992740472e-05, + "loss": 50.0102, + "step": 982 + }, + { + "epoch": 3.548984198645598, + "grad_norm": 450.6602478027344, + "learning_rate": 2.470235934664247e-05, + "loss": 50.162, + "step": 983 + }, + { + "epoch": 3.5525959367945825, + "grad_norm": 424.1731872558594, + "learning_rate": 2.4696914700544466e-05, + "loss": 49.1374, + "step": 984 + }, + { + "epoch": 3.5562076749435665, + "grad_norm": 339.78997802734375, + "learning_rate": 2.469147005444646e-05, + "loss": 47.5901, + "step": 985 + }, + { + "epoch": 3.559819413092551, + "grad_norm": 270.9290466308594, + "learning_rate": 2.4686025408348457e-05, + "loss": 48.7289, + "step": 986 + }, + { + "epoch": 3.563431151241535, + "grad_norm": 254.77444458007812, + "learning_rate": 2.4680580762250456e-05, + "loss": 45.926, + "step": 987 + }, + { + "epoch": 3.5670428893905193, + "grad_norm": 309.8949890136719, + "learning_rate": 2.467513611615245e-05, + "loss": 46.2578, + "step": 988 + }, + { + "epoch": 3.5706546275395032, + "grad_norm": 264.5209655761719, + "learning_rate": 2.4669691470054447e-05, + "loss": 46.5274, + "step": 989 + }, + { + "epoch": 3.5742663656884877, + "grad_norm": 306.8301696777344, + "learning_rate": 2.4664246823956442e-05, + "loss": 45.0636, + "step": 990 + }, + { + "epoch": 3.5742663656884877, + "eval_loss": 0.6581718921661377, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 990 + }, + { + "epoch": 3.5778781038374716, + "grad_norm": 228.46180725097656, + "learning_rate": 2.4658802177858437e-05, + "loss": 44.8446, + "step": 991 + }, + { + "epoch": 3.581489841986456, + "grad_norm": 246.97792053222656, + "learning_rate": 2.4653357531760436e-05, + "loss": 44.5141, + "step": 992 + }, + { + "epoch": 3.58510158013544, + "grad_norm": 199.88819885253906, + "learning_rate": 2.4647912885662435e-05, + "loss": 42.7822, + "step": 993 + }, + { + "epoch": 3.5887133182844244, + "grad_norm": 291.8363952636719, + "learning_rate": 2.464246823956443e-05, + "loss": 41.7004, + "step": 994 + }, + { + "epoch": 3.592325056433409, + "grad_norm": 194.8997039794922, + "learning_rate": 2.4637023593466426e-05, + "loss": 41.154, + "step": 995 + }, + { + "epoch": 3.595936794582393, + "grad_norm": 271.03863525390625, + "learning_rate": 2.463157894736842e-05, + "loss": 41.4898, + "step": 996 + }, + { + "epoch": 3.5995485327313768, + "grad_norm": 219.783203125, + "learning_rate": 2.4626134301270416e-05, + "loss": 42.7646, + "step": 997 + }, + { + "epoch": 3.603160270880361, + "grad_norm": 232.6287384033203, + "learning_rate": 2.4620689655172415e-05, + "loss": 41.9049, + "step": 998 + }, + { + "epoch": 3.6067720090293456, + "grad_norm": 209.7451934814453, + "learning_rate": 2.461524500907441e-05, + "loss": 42.2493, + "step": 999 + }, + { + "epoch": 3.6103837471783295, + "grad_norm": 202.67608642578125, + "learning_rate": 2.4609800362976406e-05, + "loss": 42.112, + "step": 1000 + }, + { + "epoch": 3.6103837471783295, + "eval_loss": 0.6473406553268433, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 1000 + }, + { + "epoch": 3.6139954853273135, + "grad_norm": 196.04354858398438, + "learning_rate": 2.4604355716878405e-05, + "loss": 42.5126, + "step": 1001 + }, + { + "epoch": 3.617607223476298, + "grad_norm": 187.22372436523438, + "learning_rate": 2.45989110707804e-05, + "loss": 44.0572, + "step": 1002 + }, + { + "epoch": 3.6212189616252823, + "grad_norm": 249.96971130371094, + "learning_rate": 2.4593466424682395e-05, + "loss": 42.9538, + "step": 1003 + }, + { + "epoch": 3.6248306997742663, + "grad_norm": 215.70565795898438, + "learning_rate": 2.4588021778584394e-05, + "loss": 44.328, + "step": 1004 + }, + { + "epoch": 3.6284424379232507, + "grad_norm": 180.83642578125, + "learning_rate": 2.458257713248639e-05, + "loss": 43.6176, + "step": 1005 + }, + { + "epoch": 3.6320541760722347, + "grad_norm": 190.6321563720703, + "learning_rate": 2.4577132486388385e-05, + "loss": 42.8305, + "step": 1006 + }, + { + "epoch": 3.635665914221219, + "grad_norm": 192.47682189941406, + "learning_rate": 2.457168784029038e-05, + "loss": 43.6692, + "step": 1007 + }, + { + "epoch": 3.639277652370203, + "grad_norm": 204.16403198242188, + "learning_rate": 2.4566243194192376e-05, + "loss": 43.6698, + "step": 1008 + }, + { + "epoch": 3.6428893905191875, + "grad_norm": 216.57371520996094, + "learning_rate": 2.4560798548094374e-05, + "loss": 42.4672, + "step": 1009 + }, + { + "epoch": 3.6465011286681714, + "grad_norm": 209.3368377685547, + "learning_rate": 2.4555353901996373e-05, + "loss": 40.9764, + "step": 1010 + }, + { + "epoch": 3.6465011286681714, + "eval_loss": 0.6477307081222534, + "eval_runtime": 3.1316, + "eval_samples_per_second": 57.159, + "eval_steps_per_second": 57.159, + "step": 1010 + }, + { + "epoch": 3.650112866817156, + "grad_norm": 209.23538208007812, + "learning_rate": 2.454990925589837e-05, + "loss": 37.8193, + "step": 1011 + }, + { + "epoch": 3.65372460496614, + "grad_norm": 236.15859985351562, + "learning_rate": 2.4544464609800364e-05, + "loss": 36.6497, + "step": 1012 + }, + { + "epoch": 3.6573363431151242, + "grad_norm": 230.68008422851562, + "learning_rate": 2.453901996370236e-05, + "loss": 36.5181, + "step": 1013 + }, + { + "epoch": 3.660948081264108, + "grad_norm": 233.6422882080078, + "learning_rate": 2.4533575317604355e-05, + "loss": 37.4292, + "step": 1014 + }, + { + "epoch": 3.6645598194130926, + "grad_norm": 263.49554443359375, + "learning_rate": 2.4528130671506354e-05, + "loss": 36.8303, + "step": 1015 + }, + { + "epoch": 3.6681715575620766, + "grad_norm": 259.7931823730469, + "learning_rate": 2.452268602540835e-05, + "loss": 38.5344, + "step": 1016 + }, + { + "epoch": 3.671783295711061, + "grad_norm": 227.5961151123047, + "learning_rate": 2.4517241379310344e-05, + "loss": 37.9728, + "step": 1017 + }, + { + "epoch": 3.6753950338600454, + "grad_norm": 209.28163146972656, + "learning_rate": 2.451179673321234e-05, + "loss": 37.1389, + "step": 1018 + }, + { + "epoch": 3.6790067720090294, + "grad_norm": 284.8781433105469, + "learning_rate": 2.450635208711434e-05, + "loss": 37.4052, + "step": 1019 + }, + { + "epoch": 3.6826185101580133, + "grad_norm": 256.3425598144531, + "learning_rate": 2.4500907441016337e-05, + "loss": 39.1912, + "step": 1020 + }, + { + "epoch": 3.6826185101580133, + "eval_loss": 0.6528274416923523, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.161, + "eval_steps_per_second": 57.161, + "step": 1020 + }, + { + "epoch": 3.6862302483069977, + "grad_norm": 218.8914031982422, + "learning_rate": 2.4495462794918333e-05, + "loss": 37.205, + "step": 1021 + }, + { + "epoch": 3.689841986455982, + "grad_norm": 254.91282653808594, + "learning_rate": 2.4490018148820328e-05, + "loss": 40.28, + "step": 1022 + }, + { + "epoch": 3.693453724604966, + "grad_norm": 235.3753662109375, + "learning_rate": 2.4484573502722323e-05, + "loss": 39.4649, + "step": 1023 + }, + { + "epoch": 3.69706546275395, + "grad_norm": 286.5908203125, + "learning_rate": 2.447912885662432e-05, + "loss": 38.5807, + "step": 1024 + }, + { + "epoch": 3.7006772009029345, + "grad_norm": 227.684814453125, + "learning_rate": 2.4473684210526318e-05, + "loss": 39.2304, + "step": 1025 + }, + { + "epoch": 3.704288939051919, + "grad_norm": 230.00128173828125, + "learning_rate": 2.4468239564428313e-05, + "loss": 39.5135, + "step": 1026 + }, + { + "epoch": 3.707900677200903, + "grad_norm": 198.72862243652344, + "learning_rate": 2.446279491833031e-05, + "loss": 36.6274, + "step": 1027 + }, + { + "epoch": 3.7115124153498873, + "grad_norm": 263.6575012207031, + "learning_rate": 2.4457350272232304e-05, + "loss": 25.5852, + "step": 1028 + }, + { + "epoch": 3.7151241534988713, + "grad_norm": 273.997314453125, + "learning_rate": 2.4451905626134302e-05, + "loss": 24.8593, + "step": 1029 + }, + { + "epoch": 3.7187358916478557, + "grad_norm": 180.25997924804688, + "learning_rate": 2.4446460980036298e-05, + "loss": 25.4596, + "step": 1030 + }, + { + "epoch": 3.7187358916478557, + "eval_loss": 0.6783067584037781, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1030 + }, + { + "epoch": 3.7223476297968396, + "grad_norm": 203.3702850341797, + "learning_rate": 2.4441016333938297e-05, + "loss": 25.9792, + "step": 1031 + }, + { + "epoch": 3.725959367945824, + "grad_norm": 393.4874572753906, + "learning_rate": 2.4435571687840292e-05, + "loss": 48.2254, + "step": 1032 + }, + { + "epoch": 3.729571106094808, + "grad_norm": 369.2442626953125, + "learning_rate": 2.4430127041742287e-05, + "loss": 49.7546, + "step": 1033 + }, + { + "epoch": 3.7331828442437924, + "grad_norm": 339.0132751464844, + "learning_rate": 2.4424682395644283e-05, + "loss": 48.1843, + "step": 1034 + }, + { + "epoch": 3.7367945823927764, + "grad_norm": 322.1737060546875, + "learning_rate": 2.4419237749546278e-05, + "loss": 47.2471, + "step": 1035 + }, + { + "epoch": 3.740406320541761, + "grad_norm": 330.0899658203125, + "learning_rate": 2.4413793103448277e-05, + "loss": 47.5831, + "step": 1036 + }, + { + "epoch": 3.7440180586907448, + "grad_norm": 306.1767578125, + "learning_rate": 2.4408348457350272e-05, + "loss": 47.0229, + "step": 1037 + }, + { + "epoch": 3.747629796839729, + "grad_norm": 279.7237548828125, + "learning_rate": 2.440290381125227e-05, + "loss": 46.801, + "step": 1038 + }, + { + "epoch": 3.751241534988713, + "grad_norm": 277.7254333496094, + "learning_rate": 2.4397459165154266e-05, + "loss": 47.2659, + "step": 1039 + }, + { + "epoch": 3.7548532731376976, + "grad_norm": 288.577880859375, + "learning_rate": 2.4392014519056262e-05, + "loss": 46.1864, + "step": 1040 + }, + { + "epoch": 3.7548532731376976, + "eval_loss": 0.6584362983703613, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 1040 + }, + { + "epoch": 3.758465011286682, + "grad_norm": 282.44989013671875, + "learning_rate": 2.4386569872958257e-05, + "loss": 43.9753, + "step": 1041 + }, + { + "epoch": 3.762076749435666, + "grad_norm": 186.32090759277344, + "learning_rate": 2.4381125226860256e-05, + "loss": 44.1678, + "step": 1042 + }, + { + "epoch": 3.76568848758465, + "grad_norm": 227.9755096435547, + "learning_rate": 2.437568058076225e-05, + "loss": 42.0396, + "step": 1043 + }, + { + "epoch": 3.7693002257336343, + "grad_norm": 188.82789611816406, + "learning_rate": 2.4370235934664247e-05, + "loss": 40.0181, + "step": 1044 + }, + { + "epoch": 3.7729119638826187, + "grad_norm": 222.2530517578125, + "learning_rate": 2.4364791288566242e-05, + "loss": 41.0851, + "step": 1045 + }, + { + "epoch": 3.7765237020316027, + "grad_norm": 196.7293243408203, + "learning_rate": 2.4359346642468238e-05, + "loss": 41.0595, + "step": 1046 + }, + { + "epoch": 3.7801354401805867, + "grad_norm": 247.01638793945312, + "learning_rate": 2.435390199637024e-05, + "loss": 41.8551, + "step": 1047 + }, + { + "epoch": 3.783747178329571, + "grad_norm": 238.08656311035156, + "learning_rate": 2.4348457350272235e-05, + "loss": 41.5365, + "step": 1048 + }, + { + "epoch": 3.7873589164785555, + "grad_norm": 205.6416778564453, + "learning_rate": 2.434301270417423e-05, + "loss": 42.804, + "step": 1049 + }, + { + "epoch": 3.7909706546275395, + "grad_norm": 236.24205017089844, + "learning_rate": 2.4337568058076226e-05, + "loss": 42.4529, + "step": 1050 + }, + { + "epoch": 3.7909706546275395, + "eval_loss": 0.6511489152908325, + "eval_runtime": 3.1324, + "eval_samples_per_second": 57.145, + "eval_steps_per_second": 57.145, + "step": 1050 + }, + { + "epoch": 3.7945823927765234, + "grad_norm": 195.8008575439453, + "learning_rate": 2.433212341197822e-05, + "loss": 42.2678, + "step": 1051 + }, + { + "epoch": 3.798194130925508, + "grad_norm": 218.7563018798828, + "learning_rate": 2.4326678765880217e-05, + "loss": 42.4501, + "step": 1052 + }, + { + "epoch": 3.8018058690744923, + "grad_norm": 209.214599609375, + "learning_rate": 2.4321234119782215e-05, + "loss": 43.0947, + "step": 1053 + }, + { + "epoch": 3.805417607223476, + "grad_norm": 235.3767852783203, + "learning_rate": 2.431578947368421e-05, + "loss": 44.3962, + "step": 1054 + }, + { + "epoch": 3.8090293453724606, + "grad_norm": 189.2035369873047, + "learning_rate": 2.4310344827586206e-05, + "loss": 43.5015, + "step": 1055 + }, + { + "epoch": 3.8126410835214446, + "grad_norm": 185.23617553710938, + "learning_rate": 2.4304900181488205e-05, + "loss": 43.4919, + "step": 1056 + }, + { + "epoch": 3.816252821670429, + "grad_norm": 197.72720336914062, + "learning_rate": 2.42994555353902e-05, + "loss": 43.5435, + "step": 1057 + }, + { + "epoch": 3.819864559819413, + "grad_norm": 210.86380004882812, + "learning_rate": 2.42940108892922e-05, + "loss": 42.8559, + "step": 1058 + }, + { + "epoch": 3.8234762979683974, + "grad_norm": 183.15798950195312, + "learning_rate": 2.4288566243194194e-05, + "loss": 43.2725, + "step": 1059 + }, + { + "epoch": 3.8270880361173814, + "grad_norm": 195.6173858642578, + "learning_rate": 2.428312159709619e-05, + "loss": 39.7816, + "step": 1060 + }, + { + "epoch": 3.8270880361173814, + "eval_loss": 0.6438961625099182, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 1060 + }, + { + "epoch": 3.8306997742663658, + "grad_norm": 218.30186462402344, + "learning_rate": 2.4277676950998185e-05, + "loss": 37.7217, + "step": 1061 + }, + { + "epoch": 3.8343115124153497, + "grad_norm": 226.92807006835938, + "learning_rate": 2.427223230490018e-05, + "loss": 35.3267, + "step": 1062 + }, + { + "epoch": 3.837923250564334, + "grad_norm": 277.8228759765625, + "learning_rate": 2.4266787658802176e-05, + "loss": 36.4474, + "step": 1063 + }, + { + "epoch": 3.8415349887133186, + "grad_norm": 233.3556365966797, + "learning_rate": 2.4261343012704175e-05, + "loss": 36.8737, + "step": 1064 + }, + { + "epoch": 3.8451467268623025, + "grad_norm": 232.3887176513672, + "learning_rate": 2.425589836660617e-05, + "loss": 35.5258, + "step": 1065 + }, + { + "epoch": 3.8487584650112865, + "grad_norm": 212.23741149902344, + "learning_rate": 2.425045372050817e-05, + "loss": 36.264, + "step": 1066 + }, + { + "epoch": 3.852370203160271, + "grad_norm": 262.5358581542969, + "learning_rate": 2.4245009074410164e-05, + "loss": 37.4407, + "step": 1067 + }, + { + "epoch": 3.8559819413092553, + "grad_norm": 250.24459838867188, + "learning_rate": 2.423956442831216e-05, + "loss": 36.753, + "step": 1068 + }, + { + "epoch": 3.8595936794582393, + "grad_norm": 234.84124755859375, + "learning_rate": 2.423411978221416e-05, + "loss": 38.1465, + "step": 1069 + }, + { + "epoch": 3.8632054176072232, + "grad_norm": 258.2744140625, + "learning_rate": 2.4228675136116154e-05, + "loss": 38.1092, + "step": 1070 + }, + { + "epoch": 3.8632054176072232, + "eval_loss": 0.6504554152488708, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 1070 + }, + { + "epoch": 3.8668171557562077, + "grad_norm": 210.83404541015625, + "learning_rate": 2.422323049001815e-05, + "loss": 37.5608, + "step": 1071 + }, + { + "epoch": 3.870428893905192, + "grad_norm": 196.8865203857422, + "learning_rate": 2.4217785843920145e-05, + "loss": 38.8805, + "step": 1072 + }, + { + "epoch": 3.874040632054176, + "grad_norm": 195.45758056640625, + "learning_rate": 2.421234119782214e-05, + "loss": 38.3821, + "step": 1073 + }, + { + "epoch": 3.87765237020316, + "grad_norm": 249.15740966796875, + "learning_rate": 2.4206896551724135e-05, + "loss": 39.2063, + "step": 1074 + }, + { + "epoch": 3.8812641083521444, + "grad_norm": 224.40455627441406, + "learning_rate": 2.4201451905626138e-05, + "loss": 40.8177, + "step": 1075 + }, + { + "epoch": 3.884875846501129, + "grad_norm": 272.9620361328125, + "learning_rate": 2.4196007259528133e-05, + "loss": 39.2645, + "step": 1076 + }, + { + "epoch": 3.888487584650113, + "grad_norm": 230.61953735351562, + "learning_rate": 2.419056261343013e-05, + "loss": 38.6852, + "step": 1077 + }, + { + "epoch": 3.892099322799097, + "grad_norm": 209.87234497070312, + "learning_rate": 2.4185117967332124e-05, + "loss": 29.6319, + "step": 1078 + }, + { + "epoch": 3.895711060948081, + "grad_norm": 249.635009765625, + "learning_rate": 2.417967332123412e-05, + "loss": 24.6807, + "step": 1079 + }, + { + "epoch": 3.8993227990970656, + "grad_norm": 185.14309692382812, + "learning_rate": 2.4174228675136118e-05, + "loss": 25.785, + "step": 1080 + }, + { + "epoch": 3.8993227990970656, + "eval_loss": 0.6733376979827881, + "eval_runtime": 3.129, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 1080 + }, + { + "epoch": 3.9029345372460496, + "grad_norm": 190.28952026367188, + "learning_rate": 2.4168784029038113e-05, + "loss": 25.702, + "step": 1081 + }, + { + "epoch": 3.906546275395034, + "grad_norm": 431.5606689453125, + "learning_rate": 2.416333938294011e-05, + "loss": 49.0322, + "step": 1082 + }, + { + "epoch": 3.910158013544018, + "grad_norm": 396.85345458984375, + "learning_rate": 2.4157894736842104e-05, + "loss": 48.3669, + "step": 1083 + }, + { + "epoch": 3.9137697516930023, + "grad_norm": 369.1654357910156, + "learning_rate": 2.4152450090744103e-05, + "loss": 47.3948, + "step": 1084 + }, + { + "epoch": 3.9173814898419863, + "grad_norm": 320.3822937011719, + "learning_rate": 2.4147005444646098e-05, + "loss": 47.5562, + "step": 1085 + }, + { + "epoch": 3.9209932279909707, + "grad_norm": 300.494140625, + "learning_rate": 2.4141560798548097e-05, + "loss": 46.1018, + "step": 1086 + }, + { + "epoch": 3.9246049661399547, + "grad_norm": 290.30462646484375, + "learning_rate": 2.4136116152450092e-05, + "loss": 44.9794, + "step": 1087 + }, + { + "epoch": 3.928216704288939, + "grad_norm": 299.4498596191406, + "learning_rate": 2.4130671506352088e-05, + "loss": 43.7329, + "step": 1088 + }, + { + "epoch": 3.931828442437923, + "grad_norm": 296.0865783691406, + "learning_rate": 2.4125226860254083e-05, + "loss": 43.3881, + "step": 1089 + }, + { + "epoch": 3.9354401805869075, + "grad_norm": 227.40028381347656, + "learning_rate": 2.411978221415608e-05, + "loss": 42.518, + "step": 1090 + }, + { + "epoch": 3.9354401805869075, + "eval_loss": 0.6501370072364807, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 1090 + }, + { + "epoch": 3.939051918735892, + "grad_norm": 236.79466247558594, + "learning_rate": 2.4114337568058077e-05, + "loss": 42.5582, + "step": 1091 + }, + { + "epoch": 3.942663656884876, + "grad_norm": 331.5859375, + "learning_rate": 2.4108892921960073e-05, + "loss": 42.1563, + "step": 1092 + }, + { + "epoch": 3.94627539503386, + "grad_norm": 296.2573547363281, + "learning_rate": 2.410344827586207e-05, + "loss": 43.1934, + "step": 1093 + }, + { + "epoch": 3.9498871331828442, + "grad_norm": 258.93499755859375, + "learning_rate": 2.4098003629764067e-05, + "loss": 43.4579, + "step": 1094 + }, + { + "epoch": 3.9534988713318286, + "grad_norm": 275.31170654296875, + "learning_rate": 2.4092558983666062e-05, + "loss": 44.4464, + "step": 1095 + }, + { + "epoch": 3.9571106094808126, + "grad_norm": 276.1750183105469, + "learning_rate": 2.4087114337568058e-05, + "loss": 44.9596, + "step": 1096 + }, + { + "epoch": 3.9607223476297966, + "grad_norm": 282.0018310546875, + "learning_rate": 2.4081669691470056e-05, + "loss": 40.7271, + "step": 1097 + }, + { + "epoch": 3.964334085778781, + "grad_norm": 350.2434387207031, + "learning_rate": 2.407622504537205e-05, + "loss": 36.7406, + "step": 1098 + }, + { + "epoch": 3.9679458239277654, + "grad_norm": 264.5498046875, + "learning_rate": 2.4070780399274047e-05, + "loss": 36.6965, + "step": 1099 + }, + { + "epoch": 3.9715575620767494, + "grad_norm": 285.5101623535156, + "learning_rate": 2.4065335753176042e-05, + "loss": 39.8293, + "step": 1100 + }, + { + "epoch": 3.9715575620767494, + "eval_loss": 0.6441511511802673, + "eval_runtime": 3.13, + "eval_samples_per_second": 57.189, + "eval_steps_per_second": 57.189, + "step": 1100 + }, + { + "epoch": 3.975169300225734, + "grad_norm": 307.22113037109375, + "learning_rate": 2.4059891107078038e-05, + "loss": 39.3198, + "step": 1101 + }, + { + "epoch": 3.9787810383747177, + "grad_norm": 214.6739044189453, + "learning_rate": 2.405444646098004e-05, + "loss": 39.2073, + "step": 1102 + }, + { + "epoch": 3.982392776523702, + "grad_norm": 205.13401794433594, + "learning_rate": 2.4049001814882035e-05, + "loss": 39.0405, + "step": 1103 + }, + { + "epoch": 3.986004514672686, + "grad_norm": 200.275634765625, + "learning_rate": 2.404355716878403e-05, + "loss": 40.9828, + "step": 1104 + }, + { + "epoch": 3.9896162528216705, + "grad_norm": 239.47377014160156, + "learning_rate": 2.4038112522686026e-05, + "loss": 40.8515, + "step": 1105 + }, + { + "epoch": 3.9932279909706545, + "grad_norm": 148.22445678710938, + "learning_rate": 2.403266787658802e-05, + "loss": 28.9885, + "step": 1106 + }, + { + "epoch": 3.996839729119639, + "grad_norm": 190.1692352294922, + "learning_rate": 2.4027223230490017e-05, + "loss": 25.4718, + "step": 1107 + }, + { + "epoch": 4.0, + "grad_norm": 180.45884704589844, + "learning_rate": 2.4021778584392016e-05, + "loss": 23.4711, + "step": 1108 + }, + { + "epoch": 4.003611738148984, + "grad_norm": 357.0400390625, + "learning_rate": 2.401633393829401e-05, + "loss": 45.9855, + "step": 1109 + }, + { + "epoch": 4.007223476297969, + "grad_norm": 361.6748962402344, + "learning_rate": 2.4010889292196006e-05, + "loss": 47.2321, + "step": 1110 + }, + { + "epoch": 4.007223476297969, + "eval_loss": 0.6618791818618774, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 1110 + }, + { + "epoch": 4.010835214446953, + "grad_norm": 350.3221740722656, + "learning_rate": 2.4005444646098002e-05, + "loss": 46.4521, + "step": 1111 + }, + { + "epoch": 4.014446952595937, + "grad_norm": 279.218994140625, + "learning_rate": 2.4e-05, + "loss": 45.3017, + "step": 1112 + }, + { + "epoch": 4.018058690744921, + "grad_norm": 247.94485473632812, + "learning_rate": 2.3994555353902e-05, + "loss": 47.0519, + "step": 1113 + }, + { + "epoch": 4.021670428893906, + "grad_norm": 218.910400390625, + "learning_rate": 2.3989110707803995e-05, + "loss": 46.2511, + "step": 1114 + }, + { + "epoch": 4.0252821670428895, + "grad_norm": 229.89830017089844, + "learning_rate": 2.398366606170599e-05, + "loss": 44.8028, + "step": 1115 + }, + { + "epoch": 4.0288939051918735, + "grad_norm": 225.46900939941406, + "learning_rate": 2.3978221415607986e-05, + "loss": 46.1378, + "step": 1116 + }, + { + "epoch": 4.0325056433408575, + "grad_norm": 243.09857177734375, + "learning_rate": 2.397277676950998e-05, + "loss": 45.8397, + "step": 1117 + }, + { + "epoch": 4.036117381489842, + "grad_norm": 219.63043212890625, + "learning_rate": 2.396733212341198e-05, + "loss": 45.481, + "step": 1118 + }, + { + "epoch": 4.039729119638826, + "grad_norm": 214.18118286132812, + "learning_rate": 2.3961887477313975e-05, + "loss": 43.6477, + "step": 1119 + }, + { + "epoch": 4.04334085778781, + "grad_norm": 228.6083984375, + "learning_rate": 2.395644283121597e-05, + "loss": 41.9656, + "step": 1120 + }, + { + "epoch": 4.04334085778781, + "eval_loss": 0.6450154185295105, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 1120 + }, + { + "epoch": 4.046952595936794, + "grad_norm": 234.56243896484375, + "learning_rate": 2.395099818511797e-05, + "loss": 42.0314, + "step": 1121 + }, + { + "epoch": 4.050564334085779, + "grad_norm": 252.39718627929688, + "learning_rate": 2.3945553539019965e-05, + "loss": 41.8559, + "step": 1122 + }, + { + "epoch": 4.054176072234763, + "grad_norm": 249.19015502929688, + "learning_rate": 2.394010889292196e-05, + "loss": 41.411, + "step": 1123 + }, + { + "epoch": 4.057787810383747, + "grad_norm": 216.54139709472656, + "learning_rate": 2.393466424682396e-05, + "loss": 41.2435, + "step": 1124 + }, + { + "epoch": 4.061399548532731, + "grad_norm": 269.6858825683594, + "learning_rate": 2.3929219600725954e-05, + "loss": 40.9555, + "step": 1125 + }, + { + "epoch": 4.065011286681716, + "grad_norm": 289.1708984375, + "learning_rate": 2.392377495462795e-05, + "loss": 41.8034, + "step": 1126 + }, + { + "epoch": 4.0686230248307, + "grad_norm": 225.65097045898438, + "learning_rate": 2.3918330308529945e-05, + "loss": 42.3489, + "step": 1127 + }, + { + "epoch": 4.072234762979684, + "grad_norm": 241.1715545654297, + "learning_rate": 2.391288566243194e-05, + "loss": 42.1899, + "step": 1128 + }, + { + "epoch": 4.075846501128668, + "grad_norm": 225.5276336669922, + "learning_rate": 2.390744101633394e-05, + "loss": 42.7326, + "step": 1129 + }, + { + "epoch": 4.079458239277653, + "grad_norm": 217.30703735351562, + "learning_rate": 2.3901996370235938e-05, + "loss": 41.9397, + "step": 1130 + }, + { + "epoch": 4.079458239277653, + "eval_loss": 0.6440457701683044, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 1130 + }, + { + "epoch": 4.083069977426637, + "grad_norm": 191.2023162841797, + "learning_rate": 2.3896551724137933e-05, + "loss": 43.5721, + "step": 1131 + }, + { + "epoch": 4.0866817155756205, + "grad_norm": 204.53013610839844, + "learning_rate": 2.389110707803993e-05, + "loss": 42.9942, + "step": 1132 + }, + { + "epoch": 4.090293453724605, + "grad_norm": 206.78817749023438, + "learning_rate": 2.3885662431941924e-05, + "loss": 42.8992, + "step": 1133 + }, + { + "epoch": 4.093905191873589, + "grad_norm": 224.03082275390625, + "learning_rate": 2.388021778584392e-05, + "loss": 42.39, + "step": 1134 + }, + { + "epoch": 4.097516930022573, + "grad_norm": 249.23992919921875, + "learning_rate": 2.3874773139745918e-05, + "loss": 43.3473, + "step": 1135 + }, + { + "epoch": 4.101128668171557, + "grad_norm": 249.36526489257812, + "learning_rate": 2.3869328493647914e-05, + "loss": 42.5243, + "step": 1136 + }, + { + "epoch": 4.104740406320542, + "grad_norm": 204.98721313476562, + "learning_rate": 2.386388384754991e-05, + "loss": 39.7782, + "step": 1137 + }, + { + "epoch": 4.108352144469526, + "grad_norm": 204.4314422607422, + "learning_rate": 2.3858439201451904e-05, + "loss": 36.1737, + "step": 1138 + }, + { + "epoch": 4.11196388261851, + "grad_norm": 207.8656005859375, + "learning_rate": 2.3852994555353903e-05, + "loss": 37.119, + "step": 1139 + }, + { + "epoch": 4.115575620767494, + "grad_norm": 204.60365295410156, + "learning_rate": 2.3847549909255902e-05, + "loss": 34.5701, + "step": 1140 + }, + { + "epoch": 4.115575620767494, + "eval_loss": 0.6381516456604004, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.124, + "eval_steps_per_second": 57.124, + "step": 1140 + }, + { + "epoch": 4.119187358916479, + "grad_norm": 207.82247924804688, + "learning_rate": 2.3842105263157897e-05, + "loss": 35.8345, + "step": 1141 + }, + { + "epoch": 4.122799097065463, + "grad_norm": 203.7960662841797, + "learning_rate": 2.3836660617059893e-05, + "loss": 36.3975, + "step": 1142 + }, + { + "epoch": 4.126410835214447, + "grad_norm": 187.17431640625, + "learning_rate": 2.3831215970961888e-05, + "loss": 36.1556, + "step": 1143 + }, + { + "epoch": 4.130022573363431, + "grad_norm": 224.93003845214844, + "learning_rate": 2.3825771324863883e-05, + "loss": 36.8714, + "step": 1144 + }, + { + "epoch": 4.133634311512416, + "grad_norm": 235.7632293701172, + "learning_rate": 2.382032667876588e-05, + "loss": 37.5072, + "step": 1145 + }, + { + "epoch": 4.1372460496614, + "grad_norm": 261.4077453613281, + "learning_rate": 2.3814882032667878e-05, + "loss": 38.0648, + "step": 1146 + }, + { + "epoch": 4.140857787810384, + "grad_norm": 233.9202117919922, + "learning_rate": 2.3809437386569873e-05, + "loss": 37.1813, + "step": 1147 + }, + { + "epoch": 4.144469525959368, + "grad_norm": 343.1669006347656, + "learning_rate": 2.380399274047187e-05, + "loss": 39.7793, + "step": 1148 + }, + { + "epoch": 4.148081264108352, + "grad_norm": 296.18121337890625, + "learning_rate": 2.3798548094373867e-05, + "loss": 39.0443, + "step": 1149 + }, + { + "epoch": 4.151693002257336, + "grad_norm": 261.0748291015625, + "learning_rate": 2.3793103448275862e-05, + "loss": 39.9487, + "step": 1150 + }, + { + "epoch": 4.151693002257336, + "eval_loss": 0.6574633717536926, + "eval_runtime": 3.1318, + "eval_samples_per_second": 57.156, + "eval_steps_per_second": 57.156, + "step": 1150 + }, + { + "epoch": 4.15530474040632, + "grad_norm": 220.5347137451172, + "learning_rate": 2.378765880217786e-05, + "loss": 39.6622, + "step": 1151 + }, + { + "epoch": 4.158916478555304, + "grad_norm": 243.7288360595703, + "learning_rate": 2.3782214156079857e-05, + "loss": 39.6427, + "step": 1152 + }, + { + "epoch": 4.162528216704289, + "grad_norm": 223.01170349121094, + "learning_rate": 2.3776769509981852e-05, + "loss": 39.4682, + "step": 1153 + }, + { + "epoch": 4.166139954853273, + "grad_norm": 292.18768310546875, + "learning_rate": 2.3771324863883847e-05, + "loss": 29.4783, + "step": 1154 + }, + { + "epoch": 4.169751693002257, + "grad_norm": 253.28433227539062, + "learning_rate": 2.3765880217785843e-05, + "loss": 24.6701, + "step": 1155 + }, + { + "epoch": 4.173363431151241, + "grad_norm": 213.90155029296875, + "learning_rate": 2.3760435571687838e-05, + "loss": 24.7208, + "step": 1156 + }, + { + "epoch": 4.176975169300226, + "grad_norm": 216.52125549316406, + "learning_rate": 2.3754990925589837e-05, + "loss": 24.5906, + "step": 1157 + }, + { + "epoch": 4.18058690744921, + "grad_norm": 208.77516174316406, + "learning_rate": 2.3749546279491836e-05, + "loss": 25.9308, + "step": 1158 + }, + { + "epoch": 4.184198645598194, + "grad_norm": 401.13751220703125, + "learning_rate": 2.374410163339383e-05, + "loss": 48.6681, + "step": 1159 + }, + { + "epoch": 4.187810383747179, + "grad_norm": 380.1224365234375, + "learning_rate": 2.3738656987295826e-05, + "loss": 48.9605, + "step": 1160 + }, + { + "epoch": 4.187810383747179, + "eval_loss": 0.6683643460273743, + "eval_runtime": 3.1304, + "eval_samples_per_second": 57.181, + "eval_steps_per_second": 57.181, + "step": 1160 + }, + { + "epoch": 4.191422121896163, + "grad_norm": 383.3838806152344, + "learning_rate": 2.3733212341197822e-05, + "loss": 48.419, + "step": 1161 + }, + { + "epoch": 4.195033860045147, + "grad_norm": 290.1167907714844, + "learning_rate": 2.372776769509982e-05, + "loss": 46.7725, + "step": 1162 + }, + { + "epoch": 4.198645598194131, + "grad_norm": 260.7622375488281, + "learning_rate": 2.3722323049001816e-05, + "loss": 45.6624, + "step": 1163 + }, + { + "epoch": 4.2022573363431155, + "grad_norm": 300.2881774902344, + "learning_rate": 2.371687840290381e-05, + "loss": 45.9416, + "step": 1164 + }, + { + "epoch": 4.2058690744920995, + "grad_norm": 241.06045532226562, + "learning_rate": 2.3711433756805807e-05, + "loss": 45.748, + "step": 1165 + }, + { + "epoch": 4.209480812641083, + "grad_norm": 218.68606567382812, + "learning_rate": 2.3705989110707802e-05, + "loss": 45.6519, + "step": 1166 + }, + { + "epoch": 4.213092550790067, + "grad_norm": 227.5732421875, + "learning_rate": 2.37005444646098e-05, + "loss": 44.2927, + "step": 1167 + }, + { + "epoch": 4.216704288939052, + "grad_norm": 295.8132629394531, + "learning_rate": 2.36950998185118e-05, + "loss": 45.37, + "step": 1168 + }, + { + "epoch": 4.220316027088036, + "grad_norm": 239.5023193359375, + "learning_rate": 2.3689655172413795e-05, + "loss": 44.3496, + "step": 1169 + }, + { + "epoch": 4.22392776523702, + "grad_norm": 211.12631225585938, + "learning_rate": 2.368421052631579e-05, + "loss": 41.8493, + "step": 1170 + }, + { + "epoch": 4.22392776523702, + "eval_loss": 0.6506755948066711, + "eval_runtime": 3.1303, + "eval_samples_per_second": 57.183, + "eval_steps_per_second": 57.183, + "step": 1170 + }, + { + "epoch": 4.227539503386004, + "grad_norm": 291.5223388671875, + "learning_rate": 2.3678765880217786e-05, + "loss": 40.9604, + "step": 1171 + }, + { + "epoch": 4.231151241534989, + "grad_norm": 218.4868927001953, + "learning_rate": 2.367332123411978e-05, + "loss": 40.1213, + "step": 1172 + }, + { + "epoch": 4.234762979683973, + "grad_norm": 176.35243225097656, + "learning_rate": 2.366787658802178e-05, + "loss": 41.5535, + "step": 1173 + }, + { + "epoch": 4.238374717832957, + "grad_norm": 188.4041290283203, + "learning_rate": 2.3662431941923775e-05, + "loss": 40.1666, + "step": 1174 + }, + { + "epoch": 4.241986455981941, + "grad_norm": 236.32740783691406, + "learning_rate": 2.365698729582577e-05, + "loss": 40.667, + "step": 1175 + }, + { + "epoch": 4.245598194130926, + "grad_norm": 197.1793670654297, + "learning_rate": 2.365154264972777e-05, + "loss": 41.7168, + "step": 1176 + }, + { + "epoch": 4.24920993227991, + "grad_norm": 242.61181640625, + "learning_rate": 2.3646098003629765e-05, + "loss": 42.7801, + "step": 1177 + }, + { + "epoch": 4.252821670428894, + "grad_norm": 268.12738037109375, + "learning_rate": 2.364065335753176e-05, + "loss": 42.7235, + "step": 1178 + }, + { + "epoch": 4.2564334085778786, + "grad_norm": 244.36843872070312, + "learning_rate": 2.363520871143376e-05, + "loss": 42.464, + "step": 1179 + }, + { + "epoch": 4.2600451467268625, + "grad_norm": 249.46437072753906, + "learning_rate": 2.3629764065335754e-05, + "loss": 42.0016, + "step": 1180 + }, + { + "epoch": 4.2600451467268625, + "eval_loss": 0.6450306177139282, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 1180 + }, + { + "epoch": 4.2636568848758465, + "grad_norm": 205.0978546142578, + "learning_rate": 2.362431941923775e-05, + "loss": 42.5624, + "step": 1181 + }, + { + "epoch": 4.2672686230248305, + "grad_norm": 220.79122924804688, + "learning_rate": 2.3618874773139745e-05, + "loss": 42.9771, + "step": 1182 + }, + { + "epoch": 4.270880361173815, + "grad_norm": 199.8367156982422, + "learning_rate": 2.361343012704174e-05, + "loss": 43.9198, + "step": 1183 + }, + { + "epoch": 4.274492099322799, + "grad_norm": 195.33636474609375, + "learning_rate": 2.360798548094374e-05, + "loss": 43.2283, + "step": 1184 + }, + { + "epoch": 4.278103837471783, + "grad_norm": 189.04556274414062, + "learning_rate": 2.3602540834845738e-05, + "loss": 43.1352, + "step": 1185 + }, + { + "epoch": 4.281715575620767, + "grad_norm": 196.6824951171875, + "learning_rate": 2.3597096188747734e-05, + "loss": 41.8298, + "step": 1186 + }, + { + "epoch": 4.285327313769752, + "grad_norm": 164.40245056152344, + "learning_rate": 2.359165154264973e-05, + "loss": 42.0144, + "step": 1187 + }, + { + "epoch": 4.288939051918736, + "grad_norm": 212.00314331054688, + "learning_rate": 2.3586206896551724e-05, + "loss": 39.0134, + "step": 1188 + }, + { + "epoch": 4.29255079006772, + "grad_norm": 220.7440643310547, + "learning_rate": 2.358076225045372e-05, + "loss": 35.7557, + "step": 1189 + }, + { + "epoch": 4.296162528216704, + "grad_norm": 196.58985900878906, + "learning_rate": 2.357531760435572e-05, + "loss": 36.0808, + "step": 1190 + }, + { + "epoch": 4.296162528216704, + "eval_loss": 0.6393749713897705, + "eval_runtime": 3.131, + "eval_samples_per_second": 57.171, + "eval_steps_per_second": 57.171, + "step": 1190 + }, + { + "epoch": 4.299774266365689, + "grad_norm": 194.13232421875, + "learning_rate": 2.3569872958257714e-05, + "loss": 36.0987, + "step": 1191 + }, + { + "epoch": 4.303386004514673, + "grad_norm": 224.85240173339844, + "learning_rate": 2.356442831215971e-05, + "loss": 36.764, + "step": 1192 + }, + { + "epoch": 4.306997742663657, + "grad_norm": 218.51856994628906, + "learning_rate": 2.3558983666061705e-05, + "loss": 37.7105, + "step": 1193 + }, + { + "epoch": 4.310609480812641, + "grad_norm": 242.14483642578125, + "learning_rate": 2.35535390199637e-05, + "loss": 38.2378, + "step": 1194 + }, + { + "epoch": 4.314221218961626, + "grad_norm": 245.50604248046875, + "learning_rate": 2.3548094373865702e-05, + "loss": 36.9229, + "step": 1195 + }, + { + "epoch": 4.3178329571106095, + "grad_norm": 215.5889892578125, + "learning_rate": 2.3542649727767697e-05, + "loss": 37.5557, + "step": 1196 + }, + { + "epoch": 4.3214446952595935, + "grad_norm": 203.4392547607422, + "learning_rate": 2.3537205081669693e-05, + "loss": 37.6031, + "step": 1197 + }, + { + "epoch": 4.3250564334085775, + "grad_norm": 231.23709106445312, + "learning_rate": 2.3531760435571688e-05, + "loss": 37.6715, + "step": 1198 + }, + { + "epoch": 4.328668171557562, + "grad_norm": 217.31813049316406, + "learning_rate": 2.3526315789473684e-05, + "loss": 37.645, + "step": 1199 + }, + { + "epoch": 4.332279909706546, + "grad_norm": 182.10690307617188, + "learning_rate": 2.352087114337568e-05, + "loss": 39.1993, + "step": 1200 + }, + { + "epoch": 4.332279909706546, + "eval_loss": 0.6532073616981506, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 1200 + }, + { + "epoch": 4.33589164785553, + "grad_norm": 232.332763671875, + "learning_rate": 2.3515426497277678e-05, + "loss": 38.1029, + "step": 1201 + }, + { + "epoch": 4.339503386004514, + "grad_norm": 251.8763885498047, + "learning_rate": 2.3509981851179673e-05, + "loss": 40.2538, + "step": 1202 + }, + { + "epoch": 4.343115124153499, + "grad_norm": 260.1363525390625, + "learning_rate": 2.350453720508167e-05, + "loss": 39.115, + "step": 1203 + }, + { + "epoch": 4.346726862302483, + "grad_norm": 227.32473754882812, + "learning_rate": 2.3499092558983667e-05, + "loss": 37.7692, + "step": 1204 + }, + { + "epoch": 4.350338600451467, + "grad_norm": 208.3872528076172, + "learning_rate": 2.3493647912885663e-05, + "loss": 26.7583, + "step": 1205 + }, + { + "epoch": 4.353950338600452, + "grad_norm": 173.05075073242188, + "learning_rate": 2.348820326678766e-05, + "loss": 24.7576, + "step": 1206 + }, + { + "epoch": 4.357562076749436, + "grad_norm": 214.4512939453125, + "learning_rate": 2.3482758620689657e-05, + "loss": 24.8792, + "step": 1207 + }, + { + "epoch": 4.36117381489842, + "grad_norm": 179.293701171875, + "learning_rate": 2.3477313974591652e-05, + "loss": 26.1507, + "step": 1208 + }, + { + "epoch": 4.364785553047404, + "grad_norm": 401.9908142089844, + "learning_rate": 2.3471869328493648e-05, + "loss": 47.4017, + "step": 1209 + }, + { + "epoch": 4.368397291196389, + "grad_norm": 399.3369140625, + "learning_rate": 2.3466424682395643e-05, + "loss": 48.0082, + "step": 1210 + }, + { + "epoch": 4.368397291196389, + "eval_loss": 0.6664602756500244, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.18, + "eval_steps_per_second": 57.18, + "step": 1210 + }, + { + "epoch": 4.372009029345373, + "grad_norm": 320.49090576171875, + "learning_rate": 2.346098003629764e-05, + "loss": 47.4843, + "step": 1211 + }, + { + "epoch": 4.375620767494357, + "grad_norm": 297.55615234375, + "learning_rate": 2.3455535390199637e-05, + "loss": 46.3087, + "step": 1212 + }, + { + "epoch": 4.3792325056433405, + "grad_norm": 245.03399658203125, + "learning_rate": 2.3450090744101636e-05, + "loss": 45.4889, + "step": 1213 + }, + { + "epoch": 4.382844243792325, + "grad_norm": 227.94091796875, + "learning_rate": 2.344464609800363e-05, + "loss": 45.8501, + "step": 1214 + }, + { + "epoch": 4.386455981941309, + "grad_norm": 262.7824401855469, + "learning_rate": 2.3439201451905627e-05, + "loss": 46.2737, + "step": 1215 + }, + { + "epoch": 4.390067720090293, + "grad_norm": 235.969970703125, + "learning_rate": 2.3433756805807622e-05, + "loss": 45.2876, + "step": 1216 + }, + { + "epoch": 4.393679458239277, + "grad_norm": 244.8028106689453, + "learning_rate": 2.342831215970962e-05, + "loss": 45.4931, + "step": 1217 + }, + { + "epoch": 4.397291196388262, + "grad_norm": 236.24844360351562, + "learning_rate": 2.3422867513611616e-05, + "loss": 45.6649, + "step": 1218 + }, + { + "epoch": 4.400902934537246, + "grad_norm": 204.7911834716797, + "learning_rate": 2.341742286751361e-05, + "loss": 43.9613, + "step": 1219 + }, + { + "epoch": 4.40451467268623, + "grad_norm": 190.6739044189453, + "learning_rate": 2.3411978221415607e-05, + "loss": 41.9267, + "step": 1220 + }, + { + "epoch": 4.40451467268623, + "eval_loss": 0.6481396555900574, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.142, + "eval_steps_per_second": 57.142, + "step": 1220 + }, + { + "epoch": 4.408126410835214, + "grad_norm": 224.25758361816406, + "learning_rate": 2.3406533575317602e-05, + "loss": 42.34, + "step": 1221 + }, + { + "epoch": 4.411738148984199, + "grad_norm": 238.21913146972656, + "learning_rate": 2.34010889292196e-05, + "loss": 40.6947, + "step": 1222 + }, + { + "epoch": 4.415349887133183, + "grad_norm": 255.64395141601562, + "learning_rate": 2.33956442831216e-05, + "loss": 39.8585, + "step": 1223 + }, + { + "epoch": 4.418961625282167, + "grad_norm": 202.08859252929688, + "learning_rate": 2.3390199637023595e-05, + "loss": 42.6031, + "step": 1224 + }, + { + "epoch": 4.422573363431152, + "grad_norm": 222.359619140625, + "learning_rate": 2.338475499092559e-05, + "loss": 41.9946, + "step": 1225 + }, + { + "epoch": 4.426185101580136, + "grad_norm": 198.84461975097656, + "learning_rate": 2.3379310344827586e-05, + "loss": 40.9174, + "step": 1226 + }, + { + "epoch": 4.42979683972912, + "grad_norm": 227.34942626953125, + "learning_rate": 2.337386569872958e-05, + "loss": 42.2865, + "step": 1227 + }, + { + "epoch": 4.433408577878104, + "grad_norm": 249.9097900390625, + "learning_rate": 2.336842105263158e-05, + "loss": 42.6508, + "step": 1228 + }, + { + "epoch": 4.437020316027088, + "grad_norm": 236.96009826660156, + "learning_rate": 2.3362976406533576e-05, + "loss": 43.0846, + "step": 1229 + }, + { + "epoch": 4.440632054176072, + "grad_norm": 183.06201171875, + "learning_rate": 2.335753176043557e-05, + "loss": 42.4119, + "step": 1230 + }, + { + "epoch": 4.440632054176072, + "eval_loss": 0.6428424715995789, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 1230 + }, + { + "epoch": 4.444243792325056, + "grad_norm": 199.0382843017578, + "learning_rate": 2.335208711433757e-05, + "loss": 43.1702, + "step": 1231 + }, + { + "epoch": 4.44785553047404, + "grad_norm": 221.87939453125, + "learning_rate": 2.3346642468239565e-05, + "loss": 43.3518, + "step": 1232 + }, + { + "epoch": 4.451467268623025, + "grad_norm": 205.0601043701172, + "learning_rate": 2.3341197822141564e-05, + "loss": 42.9713, + "step": 1233 + }, + { + "epoch": 4.455079006772009, + "grad_norm": 235.3998565673828, + "learning_rate": 2.333575317604356e-05, + "loss": 42.6973, + "step": 1234 + }, + { + "epoch": 4.458690744920993, + "grad_norm": 171.76986694335938, + "learning_rate": 2.3330308529945555e-05, + "loss": 43.351, + "step": 1235 + }, + { + "epoch": 4.462302483069977, + "grad_norm": 261.549072265625, + "learning_rate": 2.332486388384755e-05, + "loss": 43.8662, + "step": 1236 + }, + { + "epoch": 4.465914221218962, + "grad_norm": 256.76837158203125, + "learning_rate": 2.3319419237749545e-05, + "loss": 40.7938, + "step": 1237 + }, + { + "epoch": 4.469525959367946, + "grad_norm": 176.35060119628906, + "learning_rate": 2.331397459165154e-05, + "loss": 38.1021, + "step": 1238 + }, + { + "epoch": 4.47313769751693, + "grad_norm": 203.00906372070312, + "learning_rate": 2.330852994555354e-05, + "loss": 36.6359, + "step": 1239 + }, + { + "epoch": 4.476749435665914, + "grad_norm": 259.6462707519531, + "learning_rate": 2.3303085299455535e-05, + "loss": 34.448, + "step": 1240 + }, + { + "epoch": 4.476749435665914, + "eval_loss": 0.6386051177978516, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 1240 + }, + { + "epoch": 4.480361173814899, + "grad_norm": 215.24737548828125, + "learning_rate": 2.3297640653357534e-05, + "loss": 35.2353, + "step": 1241 + }, + { + "epoch": 4.483972911963883, + "grad_norm": 249.12355041503906, + "learning_rate": 2.329219600725953e-05, + "loss": 38.2077, + "step": 1242 + }, + { + "epoch": 4.487584650112867, + "grad_norm": 191.0881805419922, + "learning_rate": 2.3286751361161525e-05, + "loss": 36.8363, + "step": 1243 + }, + { + "epoch": 4.491196388261851, + "grad_norm": 229.26449584960938, + "learning_rate": 2.3281306715063523e-05, + "loss": 36.7398, + "step": 1244 + }, + { + "epoch": 4.4948081264108355, + "grad_norm": 184.931884765625, + "learning_rate": 2.327586206896552e-05, + "loss": 35.6614, + "step": 1245 + }, + { + "epoch": 4.4984198645598195, + "grad_norm": 183.7378387451172, + "learning_rate": 2.3270417422867514e-05, + "loss": 36.9818, + "step": 1246 + }, + { + "epoch": 4.502031602708803, + "grad_norm": 191.42543029785156, + "learning_rate": 2.326497277676951e-05, + "loss": 38.1348, + "step": 1247 + }, + { + "epoch": 4.505643340857787, + "grad_norm": 211.6359100341797, + "learning_rate": 2.3259528130671505e-05, + "loss": 37.0112, + "step": 1248 + }, + { + "epoch": 4.509255079006772, + "grad_norm": 245.6946563720703, + "learning_rate": 2.32540834845735e-05, + "loss": 38.6218, + "step": 1249 + }, + { + "epoch": 4.512866817155756, + "grad_norm": 193.29095458984375, + "learning_rate": 2.3248638838475502e-05, + "loss": 36.9687, + "step": 1250 + }, + { + "epoch": 4.512866817155756, + "eval_loss": 0.6432057023048401, + "eval_runtime": 3.1301, + "eval_samples_per_second": 57.187, + "eval_steps_per_second": 57.187, + "step": 1250 + }, + { + "epoch": 4.51647855530474, + "grad_norm": 247.0595245361328, + "learning_rate": 2.3243194192377498e-05, + "loss": 39.8086, + "step": 1251 + }, + { + "epoch": 4.520090293453725, + "grad_norm": 243.1544189453125, + "learning_rate": 2.3237749546279493e-05, + "loss": 38.7245, + "step": 1252 + }, + { + "epoch": 4.523702031602709, + "grad_norm": 322.0834045410156, + "learning_rate": 2.323230490018149e-05, + "loss": 39.5335, + "step": 1253 + }, + { + "epoch": 4.527313769751693, + "grad_norm": 201.5956573486328, + "learning_rate": 2.3226860254083484e-05, + "loss": 30.2928, + "step": 1254 + }, + { + "epoch": 4.530925507900677, + "grad_norm": 186.13291931152344, + "learning_rate": 2.3221415607985483e-05, + "loss": 24.8504, + "step": 1255 + }, + { + "epoch": 4.534537246049661, + "grad_norm": 251.50608825683594, + "learning_rate": 2.3215970961887478e-05, + "loss": 24.5528, + "step": 1256 + }, + { + "epoch": 4.538148984198646, + "grad_norm": 180.21124267578125, + "learning_rate": 2.3210526315789473e-05, + "loss": 25.0864, + "step": 1257 + }, + { + "epoch": 4.54176072234763, + "grad_norm": 206.5410614013672, + "learning_rate": 2.320508166969147e-05, + "loss": 27.1602, + "step": 1258 + }, + { + "epoch": 4.545372460496614, + "grad_norm": 342.1103210449219, + "learning_rate": 2.3199637023593468e-05, + "loss": 47.3734, + "step": 1259 + }, + { + "epoch": 4.5489841986455986, + "grad_norm": 418.3056945800781, + "learning_rate": 2.3194192377495463e-05, + "loss": 48.0316, + "step": 1260 + }, + { + "epoch": 4.5489841986455986, + "eval_loss": 0.6742400527000427, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 1260 + }, + { + "epoch": 4.5525959367945825, + "grad_norm": 369.8560791015625, + "learning_rate": 2.3188747731397462e-05, + "loss": 47.4532, + "step": 1261 + }, + { + "epoch": 4.5562076749435665, + "grad_norm": 322.0288391113281, + "learning_rate": 2.3183303085299457e-05, + "loss": 47.0661, + "step": 1262 + }, + { + "epoch": 4.5598194130925505, + "grad_norm": 244.79066467285156, + "learning_rate": 2.3177858439201453e-05, + "loss": 45.1875, + "step": 1263 + }, + { + "epoch": 4.563431151241535, + "grad_norm": 209.29397583007812, + "learning_rate": 2.3172413793103448e-05, + "loss": 46.1355, + "step": 1264 + }, + { + "epoch": 4.567042889390519, + "grad_norm": 271.5123291015625, + "learning_rate": 2.3166969147005443e-05, + "loss": 45.8947, + "step": 1265 + }, + { + "epoch": 4.570654627539503, + "grad_norm": 232.42913818359375, + "learning_rate": 2.3161524500907442e-05, + "loss": 45.6542, + "step": 1266 + }, + { + "epoch": 4.574266365688487, + "grad_norm": 282.50738525390625, + "learning_rate": 2.3156079854809437e-05, + "loss": 45.8805, + "step": 1267 + }, + { + "epoch": 4.577878103837472, + "grad_norm": 203.39031982421875, + "learning_rate": 2.3150635208711436e-05, + "loss": 44.8926, + "step": 1268 + }, + { + "epoch": 4.581489841986456, + "grad_norm": 213.94894409179688, + "learning_rate": 2.314519056261343e-05, + "loss": 43.7589, + "step": 1269 + }, + { + "epoch": 4.58510158013544, + "grad_norm": 198.9677734375, + "learning_rate": 2.3139745916515427e-05, + "loss": 41.819, + "step": 1270 + }, + { + "epoch": 4.58510158013544, + "eval_loss": 0.6428627371788025, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1270 + }, + { + "epoch": 4.588713318284425, + "grad_norm": 197.69903564453125, + "learning_rate": 2.3134301270417422e-05, + "loss": 40.6128, + "step": 1271 + }, + { + "epoch": 4.592325056433409, + "grad_norm": 229.10488891601562, + "learning_rate": 2.312885662431942e-05, + "loss": 41.1856, + "step": 1272 + }, + { + "epoch": 4.595936794582393, + "grad_norm": 254.4750213623047, + "learning_rate": 2.3123411978221417e-05, + "loss": 40.2048, + "step": 1273 + }, + { + "epoch": 4.599548532731377, + "grad_norm": 247.2012939453125, + "learning_rate": 2.3117967332123412e-05, + "loss": 41.663, + "step": 1274 + }, + { + "epoch": 4.603160270880361, + "grad_norm": 196.78761291503906, + "learning_rate": 2.3112522686025407e-05, + "loss": 41.1102, + "step": 1275 + }, + { + "epoch": 4.606772009029346, + "grad_norm": 179.03880310058594, + "learning_rate": 2.3107078039927403e-05, + "loss": 39.6368, + "step": 1276 + }, + { + "epoch": 4.6103837471783295, + "grad_norm": 203.49159240722656, + "learning_rate": 2.3101633393829405e-05, + "loss": 42.9424, + "step": 1277 + }, + { + "epoch": 4.6139954853273135, + "grad_norm": 254.80018615722656, + "learning_rate": 2.30961887477314e-05, + "loss": 42.0636, + "step": 1278 + }, + { + "epoch": 4.617607223476298, + "grad_norm": 201.86109924316406, + "learning_rate": 2.3090744101633396e-05, + "loss": 41.4738, + "step": 1279 + }, + { + "epoch": 4.621218961625282, + "grad_norm": 185.1239471435547, + "learning_rate": 2.308529945553539e-05, + "loss": 41.8529, + "step": 1280 + }, + { + "epoch": 4.621218961625282, + "eval_loss": 0.6457561254501343, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 1280 + }, + { + "epoch": 4.624830699774266, + "grad_norm": 198.6769561767578, + "learning_rate": 2.3079854809437386e-05, + "loss": 41.8397, + "step": 1281 + }, + { + "epoch": 4.62844243792325, + "grad_norm": 254.9165496826172, + "learning_rate": 2.3074410163339382e-05, + "loss": 43.5585, + "step": 1282 + }, + { + "epoch": 4.632054176072235, + "grad_norm": 183.61181640625, + "learning_rate": 2.306896551724138e-05, + "loss": 41.7349, + "step": 1283 + }, + { + "epoch": 4.635665914221219, + "grad_norm": 206.0381622314453, + "learning_rate": 2.3063520871143376e-05, + "loss": 42.6239, + "step": 1284 + }, + { + "epoch": 4.639277652370203, + "grad_norm": 188.5303497314453, + "learning_rate": 2.305807622504537e-05, + "loss": 43.0988, + "step": 1285 + }, + { + "epoch": 4.642889390519187, + "grad_norm": 208.30039978027344, + "learning_rate": 2.3052631578947367e-05, + "loss": 43.8379, + "step": 1286 + }, + { + "epoch": 4.646501128668172, + "grad_norm": 209.494384765625, + "learning_rate": 2.3047186932849365e-05, + "loss": 41.4395, + "step": 1287 + }, + { + "epoch": 4.650112866817156, + "grad_norm": 223.97824096679688, + "learning_rate": 2.3041742286751364e-05, + "loss": 38.5792, + "step": 1288 + }, + { + "epoch": 4.65372460496614, + "grad_norm": 209.16192626953125, + "learning_rate": 2.303629764065336e-05, + "loss": 36.2448, + "step": 1289 + }, + { + "epoch": 4.657336343115124, + "grad_norm": 260.72821044921875, + "learning_rate": 2.3030852994555355e-05, + "loss": 35.1692, + "step": 1290 + }, + { + "epoch": 4.657336343115124, + "eval_loss": 0.6381233334541321, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1290 + }, + { + "epoch": 4.660948081264109, + "grad_norm": 222.2270965576172, + "learning_rate": 2.302540834845735e-05, + "loss": 35.2234, + "step": 1291 + }, + { + "epoch": 4.664559819413093, + "grad_norm": 208.68218994140625, + "learning_rate": 2.3019963702359346e-05, + "loss": 35.6167, + "step": 1292 + }, + { + "epoch": 4.668171557562077, + "grad_norm": 199.57015991210938, + "learning_rate": 2.301451905626134e-05, + "loss": 36.9489, + "step": 1293 + }, + { + "epoch": 4.6717832957110605, + "grad_norm": 249.1312255859375, + "learning_rate": 2.300907441016334e-05, + "loss": 37.0681, + "step": 1294 + }, + { + "epoch": 4.675395033860045, + "grad_norm": 227.86341857910156, + "learning_rate": 2.3003629764065335e-05, + "loss": 38.3897, + "step": 1295 + }, + { + "epoch": 4.679006772009029, + "grad_norm": 290.3368225097656, + "learning_rate": 2.2998185117967334e-05, + "loss": 39.1391, + "step": 1296 + }, + { + "epoch": 4.682618510158013, + "grad_norm": 222.59974670410156, + "learning_rate": 2.299274047186933e-05, + "loss": 38.6362, + "step": 1297 + }, + { + "epoch": 4.686230248306998, + "grad_norm": 233.853515625, + "learning_rate": 2.2987295825771325e-05, + "loss": 37.1796, + "step": 1298 + }, + { + "epoch": 4.689841986455982, + "grad_norm": 202.83212280273438, + "learning_rate": 2.2981851179673324e-05, + "loss": 38.5097, + "step": 1299 + }, + { + "epoch": 4.693453724604966, + "grad_norm": 203.59027099609375, + "learning_rate": 2.297640653357532e-05, + "loss": 38.3335, + "step": 1300 + }, + { + "epoch": 4.693453724604966, + "eval_loss": 0.6446877717971802, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 1300 + }, + { + "epoch": 4.69706546275395, + "grad_norm": 250.48324584960938, + "learning_rate": 2.2970961887477314e-05, + "loss": 39.1848, + "step": 1301 + }, + { + "epoch": 4.700677200902934, + "grad_norm": 218.0867462158203, + "learning_rate": 2.296551724137931e-05, + "loss": 38.2276, + "step": 1302 + }, + { + "epoch": 4.704288939051919, + "grad_norm": 316.4258728027344, + "learning_rate": 2.2960072595281305e-05, + "loss": 38.4487, + "step": 1303 + }, + { + "epoch": 4.707900677200903, + "grad_norm": 262.96832275390625, + "learning_rate": 2.29546279491833e-05, + "loss": 29.1075, + "step": 1304 + }, + { + "epoch": 4.711512415349887, + "grad_norm": 261.25897216796875, + "learning_rate": 2.2949183303085303e-05, + "loss": 24.6257, + "step": 1305 + }, + { + "epoch": 4.715124153498872, + "grad_norm": 223.29014587402344, + "learning_rate": 2.2943738656987298e-05, + "loss": 24.4387, + "step": 1306 + }, + { + "epoch": 4.718735891647856, + "grad_norm": 167.95193481445312, + "learning_rate": 2.2938294010889293e-05, + "loss": 25.0916, + "step": 1307 + }, + { + "epoch": 4.72234762979684, + "grad_norm": 203.88392639160156, + "learning_rate": 2.293284936479129e-05, + "loss": 26.1631, + "step": 1308 + }, + { + "epoch": 4.725959367945824, + "grad_norm": 350.67657470703125, + "learning_rate": 2.2927404718693284e-05, + "loss": 47.7021, + "step": 1309 + }, + { + "epoch": 4.7295711060948085, + "grad_norm": 357.1839294433594, + "learning_rate": 2.2921960072595283e-05, + "loss": 47.8161, + "step": 1310 + }, + { + "epoch": 4.7295711060948085, + "eval_loss": 0.6716815829277039, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 1310 + }, + { + "epoch": 4.733182844243792, + "grad_norm": 334.40216064453125, + "learning_rate": 2.291651542649728e-05, + "loss": 47.5608, + "step": 1311 + }, + { + "epoch": 4.736794582392776, + "grad_norm": 322.90008544921875, + "learning_rate": 2.2911070780399274e-05, + "loss": 45.9858, + "step": 1312 + }, + { + "epoch": 4.74040632054176, + "grad_norm": 291.5083923339844, + "learning_rate": 2.290562613430127e-05, + "loss": 45.9813, + "step": 1313 + }, + { + "epoch": 4.744018058690745, + "grad_norm": 234.91102600097656, + "learning_rate": 2.2900181488203268e-05, + "loss": 44.4287, + "step": 1314 + }, + { + "epoch": 4.747629796839729, + "grad_norm": 271.03582763671875, + "learning_rate": 2.2894736842105263e-05, + "loss": 45.3697, + "step": 1315 + }, + { + "epoch": 4.751241534988713, + "grad_norm": 256.219482421875, + "learning_rate": 2.2889292196007262e-05, + "loss": 45.1817, + "step": 1316 + }, + { + "epoch": 4.754853273137698, + "grad_norm": 252.0631561279297, + "learning_rate": 2.2883847549909257e-05, + "loss": 45.2029, + "step": 1317 + }, + { + "epoch": 4.758465011286682, + "grad_norm": 249.41812133789062, + "learning_rate": 2.2878402903811253e-05, + "loss": 44.9802, + "step": 1318 + }, + { + "epoch": 4.762076749435666, + "grad_norm": 208.9102325439453, + "learning_rate": 2.2872958257713248e-05, + "loss": 44.3745, + "step": 1319 + }, + { + "epoch": 4.76568848758465, + "grad_norm": 322.94903564453125, + "learning_rate": 2.2867513611615244e-05, + "loss": 40.9193, + "step": 1320 + }, + { + "epoch": 4.76568848758465, + "eval_loss": 0.6515910029411316, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1320 + }, + { + "epoch": 4.769300225733634, + "grad_norm": 264.6942138671875, + "learning_rate": 2.2862068965517242e-05, + "loss": 39.7286, + "step": 1321 + }, + { + "epoch": 4.772911963882619, + "grad_norm": 276.6095886230469, + "learning_rate": 2.2856624319419238e-05, + "loss": 41.3846, + "step": 1322 + }, + { + "epoch": 4.776523702031603, + "grad_norm": 199.59877014160156, + "learning_rate": 2.2851179673321233e-05, + "loss": 40.5583, + "step": 1323 + }, + { + "epoch": 4.780135440180587, + "grad_norm": 252.59158325195312, + "learning_rate": 2.2845735027223232e-05, + "loss": 40.9513, + "step": 1324 + }, + { + "epoch": 4.7837471783295715, + "grad_norm": 215.53826904296875, + "learning_rate": 2.2840290381125227e-05, + "loss": 41.5119, + "step": 1325 + }, + { + "epoch": 4.7873589164785555, + "grad_norm": 290.7100524902344, + "learning_rate": 2.2834845735027226e-05, + "loss": 42.7646, + "step": 1326 + }, + { + "epoch": 4.7909706546275395, + "grad_norm": 190.2306671142578, + "learning_rate": 2.282940108892922e-05, + "loss": 42.2708, + "step": 1327 + }, + { + "epoch": 4.794582392776523, + "grad_norm": 187.5550079345703, + "learning_rate": 2.2823956442831217e-05, + "loss": 41.9279, + "step": 1328 + }, + { + "epoch": 4.798194130925508, + "grad_norm": 169.10414123535156, + "learning_rate": 2.2818511796733212e-05, + "loss": 42.2688, + "step": 1329 + }, + { + "epoch": 4.801805869074492, + "grad_norm": 199.5216064453125, + "learning_rate": 2.2813067150635208e-05, + "loss": 41.9192, + "step": 1330 + }, + { + "epoch": 4.801805869074492, + "eval_loss": 0.6402038335800171, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 1330 + }, + { + "epoch": 4.805417607223476, + "grad_norm": 222.4996337890625, + "learning_rate": 2.2807622504537203e-05, + "loss": 43.8218, + "step": 1331 + }, + { + "epoch": 4.80902934537246, + "grad_norm": 228.1157684326172, + "learning_rate": 2.2802177858439202e-05, + "loss": 42.9497, + "step": 1332 + }, + { + "epoch": 4.812641083521445, + "grad_norm": 179.83697509765625, + "learning_rate": 2.27967332123412e-05, + "loss": 43.9723, + "step": 1333 + }, + { + "epoch": 4.816252821670429, + "grad_norm": 196.81983947753906, + "learning_rate": 2.2791288566243196e-05, + "loss": 43.3302, + "step": 1334 + }, + { + "epoch": 4.819864559819413, + "grad_norm": 186.61160278320312, + "learning_rate": 2.278584392014519e-05, + "loss": 41.8957, + "step": 1335 + }, + { + "epoch": 4.823476297968397, + "grad_norm": 242.55886840820312, + "learning_rate": 2.2780399274047187e-05, + "loss": 43.1916, + "step": 1336 + }, + { + "epoch": 4.827088036117382, + "grad_norm": 212.07177734375, + "learning_rate": 2.2774954627949185e-05, + "loss": 38.3371, + "step": 1337 + }, + { + "epoch": 4.830699774266366, + "grad_norm": 180.1990966796875, + "learning_rate": 2.276950998185118e-05, + "loss": 36.3413, + "step": 1338 + }, + { + "epoch": 4.83431151241535, + "grad_norm": 202.69529724121094, + "learning_rate": 2.2764065335753176e-05, + "loss": 35.4426, + "step": 1339 + }, + { + "epoch": 4.837923250564334, + "grad_norm": 180.47283935546875, + "learning_rate": 2.275862068965517e-05, + "loss": 35.5281, + "step": 1340 + }, + { + "epoch": 4.837923250564334, + "eval_loss": 0.6356105804443359, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 1340 + }, + { + "epoch": 4.8415349887133186, + "grad_norm": 204.674560546875, + "learning_rate": 2.2753176043557167e-05, + "loss": 36.2566, + "step": 1341 + }, + { + "epoch": 4.8451467268623025, + "grad_norm": 272.1197204589844, + "learning_rate": 2.2747731397459166e-05, + "loss": 36.3862, + "step": 1342 + }, + { + "epoch": 4.8487584650112865, + "grad_norm": 235.55101013183594, + "learning_rate": 2.2742286751361165e-05, + "loss": 35.1455, + "step": 1343 + }, + { + "epoch": 4.852370203160271, + "grad_norm": 271.2718200683594, + "learning_rate": 2.273684210526316e-05, + "loss": 37.3824, + "step": 1344 + }, + { + "epoch": 4.855981941309255, + "grad_norm": 242.15728759765625, + "learning_rate": 2.2731397459165155e-05, + "loss": 37.6587, + "step": 1345 + }, + { + "epoch": 4.859593679458239, + "grad_norm": 218.59481811523438, + "learning_rate": 2.272595281306715e-05, + "loss": 36.7602, + "step": 1346 + }, + { + "epoch": 4.863205417607223, + "grad_norm": 231.9490203857422, + "learning_rate": 2.2720508166969146e-05, + "loss": 38.187, + "step": 1347 + }, + { + "epoch": 4.866817155756207, + "grad_norm": 385.56158447265625, + "learning_rate": 2.2715063520871145e-05, + "loss": 38.1905, + "step": 1348 + }, + { + "epoch": 4.870428893905192, + "grad_norm": 219.38204956054688, + "learning_rate": 2.270961887477314e-05, + "loss": 38.2179, + "step": 1349 + }, + { + "epoch": 4.874040632054176, + "grad_norm": 209.46580505371094, + "learning_rate": 2.2704174228675136e-05, + "loss": 37.3696, + "step": 1350 + }, + { + "epoch": 4.874040632054176, + "eval_loss": 0.6412517428398132, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 1350 + }, + { + "epoch": 4.87765237020316, + "grad_norm": 205.53416442871094, + "learning_rate": 2.2698729582577134e-05, + "loss": 38.5144, + "step": 1351 + }, + { + "epoch": 4.881264108352145, + "grad_norm": 214.2522735595703, + "learning_rate": 2.269328493647913e-05, + "loss": 38.7372, + "step": 1352 + }, + { + "epoch": 4.884875846501129, + "grad_norm": 236.9787139892578, + "learning_rate": 2.2687840290381125e-05, + "loss": 38.8987, + "step": 1353 + }, + { + "epoch": 4.888487584650113, + "grad_norm": 247.30906677246094, + "learning_rate": 2.2682395644283124e-05, + "loss": 35.0837, + "step": 1354 + }, + { + "epoch": 4.892099322799097, + "grad_norm": 287.5954284667969, + "learning_rate": 2.267695099818512e-05, + "loss": 25.5272, + "step": 1355 + }, + { + "epoch": 4.895711060948082, + "grad_norm": 254.61672973632812, + "learning_rate": 2.2671506352087115e-05, + "loss": 25.1288, + "step": 1356 + }, + { + "epoch": 4.899322799097066, + "grad_norm": 180.98666381835938, + "learning_rate": 2.266606170598911e-05, + "loss": 25.0588, + "step": 1357 + }, + { + "epoch": 4.9029345372460496, + "grad_norm": 213.0275421142578, + "learning_rate": 2.2660617059891105e-05, + "loss": 25.464, + "step": 1358 + }, + { + "epoch": 4.9065462753950335, + "grad_norm": 385.18035888671875, + "learning_rate": 2.2655172413793104e-05, + "loss": 47.0056, + "step": 1359 + }, + { + "epoch": 4.910158013544018, + "grad_norm": 383.4106140136719, + "learning_rate": 2.2649727767695103e-05, + "loss": 46.9892, + "step": 1360 + }, + { + "epoch": 4.910158013544018, + "eval_loss": 0.6618479490280151, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 1360 + }, + { + "epoch": 4.913769751693002, + "grad_norm": 415.4345397949219, + "learning_rate": 2.26442831215971e-05, + "loss": 47.1619, + "step": 1361 + }, + { + "epoch": 4.917381489841986, + "grad_norm": 362.338134765625, + "learning_rate": 2.2638838475499094e-05, + "loss": 46.7232, + "step": 1362 + }, + { + "epoch": 4.92099322799097, + "grad_norm": 378.7535400390625, + "learning_rate": 2.263339382940109e-05, + "loss": 46.4438, + "step": 1363 + }, + { + "epoch": 4.924604966139955, + "grad_norm": 251.64901733398438, + "learning_rate": 2.2627949183303085e-05, + "loss": 44.8178, + "step": 1364 + }, + { + "epoch": 4.928216704288939, + "grad_norm": 273.1052551269531, + "learning_rate": 2.2622504537205083e-05, + "loss": 43.0865, + "step": 1365 + }, + { + "epoch": 4.931828442437923, + "grad_norm": 229.66415405273438, + "learning_rate": 2.261705989110708e-05, + "loss": 42.2463, + "step": 1366 + }, + { + "epoch": 4.935440180586907, + "grad_norm": 229.47940063476562, + "learning_rate": 2.2611615245009074e-05, + "loss": 42.4395, + "step": 1367 + }, + { + "epoch": 4.939051918735892, + "grad_norm": 224.48890686035156, + "learning_rate": 2.260617059891107e-05, + "loss": 42.4994, + "step": 1368 + }, + { + "epoch": 4.942663656884876, + "grad_norm": 241.98745727539062, + "learning_rate": 2.2600725952813065e-05, + "loss": 42.5535, + "step": 1369 + }, + { + "epoch": 4.94627539503386, + "grad_norm": 258.1711120605469, + "learning_rate": 2.2595281306715067e-05, + "loss": 42.8475, + "step": 1370 + }, + { + "epoch": 4.94627539503386, + "eval_loss": 0.639252245426178, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.09, + "eval_steps_per_second": 57.09, + "step": 1370 + }, + { + "epoch": 4.949887133182845, + "grad_norm": 204.64927673339844, + "learning_rate": 2.2589836660617062e-05, + "loss": 42.9895, + "step": 1371 + }, + { + "epoch": 4.953498871331829, + "grad_norm": 342.9057922363281, + "learning_rate": 2.2584392014519058e-05, + "loss": 43.1972, + "step": 1372 + }, + { + "epoch": 4.957110609480813, + "grad_norm": 207.45504760742188, + "learning_rate": 2.2578947368421053e-05, + "loss": 42.406, + "step": 1373 + }, + { + "epoch": 4.960722347629797, + "grad_norm": 232.78831481933594, + "learning_rate": 2.257350272232305e-05, + "loss": 36.8817, + "step": 1374 + }, + { + "epoch": 4.9643340857787805, + "grad_norm": 249.3349609375, + "learning_rate": 2.2568058076225044e-05, + "loss": 34.584, + "step": 1375 + }, + { + "epoch": 4.967945823927765, + "grad_norm": 322.7100524902344, + "learning_rate": 2.2562613430127043e-05, + "loss": 36.9512, + "step": 1376 + }, + { + "epoch": 4.971557562076749, + "grad_norm": 357.65228271484375, + "learning_rate": 2.2557168784029038e-05, + "loss": 37.6833, + "step": 1377 + }, + { + "epoch": 4.975169300225733, + "grad_norm": 300.0970153808594, + "learning_rate": 2.2551724137931033e-05, + "loss": 38.597, + "step": 1378 + }, + { + "epoch": 4.978781038374718, + "grad_norm": 234.52508544921875, + "learning_rate": 2.2546279491833032e-05, + "loss": 38.4155, + "step": 1379 + }, + { + "epoch": 4.982392776523702, + "grad_norm": 270.60626220703125, + "learning_rate": 2.2540834845735028e-05, + "loss": 38.1589, + "step": 1380 + }, + { + "epoch": 4.982392776523702, + "eval_loss": 0.6409950256347656, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 1380 + }, + { + "epoch": 4.986004514672686, + "grad_norm": 232.9596710205078, + "learning_rate": 2.2535390199637026e-05, + "loss": 39.281, + "step": 1381 + }, + { + "epoch": 4.98961625282167, + "grad_norm": 248.0550994873047, + "learning_rate": 2.2529945553539022e-05, + "loss": 40.0868, + "step": 1382 + }, + { + "epoch": 4.993227990970655, + "grad_norm": 256.327880859375, + "learning_rate": 2.2524500907441017e-05, + "loss": 28.1259, + "step": 1383 + }, + { + "epoch": 4.996839729119639, + "grad_norm": 198.29559326171875, + "learning_rate": 2.2519056261343012e-05, + "loss": 25.3166, + "step": 1384 + }, + { + "epoch": 5.0, + "grad_norm": 174.66856384277344, + "learning_rate": 2.2513611615245008e-05, + "loss": 22.0749, + "step": 1385 + }, + { + "epoch": 5.003611738148984, + "grad_norm": 309.0927429199219, + "learning_rate": 2.2508166969147003e-05, + "loss": 45.2433, + "step": 1386 + }, + { + "epoch": 5.007223476297969, + "grad_norm": 293.1455383300781, + "learning_rate": 2.2502722323049002e-05, + "loss": 46.7025, + "step": 1387 + }, + { + "epoch": 5.010835214446953, + "grad_norm": 269.47662353515625, + "learning_rate": 2.2497277676951e-05, + "loss": 45.3218, + "step": 1388 + }, + { + "epoch": 5.014446952595937, + "grad_norm": 284.49560546875, + "learning_rate": 2.2491833030852996e-05, + "loss": 44.9849, + "step": 1389 + }, + { + "epoch": 5.018058690744921, + "grad_norm": 223.5511474609375, + "learning_rate": 2.248638838475499e-05, + "loss": 44.887, + "step": 1390 + }, + { + "epoch": 5.018058690744921, + "eval_loss": 0.6435533165931702, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1390 + }, + { + "epoch": 5.021670428893906, + "grad_norm": 243.4492645263672, + "learning_rate": 2.2480943738656987e-05, + "loss": 45.1483, + "step": 1391 + }, + { + "epoch": 5.0252821670428895, + "grad_norm": 265.1712646484375, + "learning_rate": 2.2475499092558986e-05, + "loss": 44.3713, + "step": 1392 + }, + { + "epoch": 5.0288939051918735, + "grad_norm": 190.72190856933594, + "learning_rate": 2.247005444646098e-05, + "loss": 45.3138, + "step": 1393 + }, + { + "epoch": 5.0325056433408575, + "grad_norm": 177.26686096191406, + "learning_rate": 2.2464609800362976e-05, + "loss": 43.302, + "step": 1394 + }, + { + "epoch": 5.036117381489842, + "grad_norm": 198.6124725341797, + "learning_rate": 2.2459165154264972e-05, + "loss": 43.6363, + "step": 1395 + }, + { + "epoch": 5.039729119638826, + "grad_norm": 233.78738403320312, + "learning_rate": 2.2453720508166967e-05, + "loss": 43.0345, + "step": 1396 + }, + { + "epoch": 5.04334085778781, + "grad_norm": 225.48614501953125, + "learning_rate": 2.2448275862068966e-05, + "loss": 41.5932, + "step": 1397 + }, + { + "epoch": 5.046952595936794, + "grad_norm": 204.31179809570312, + "learning_rate": 2.2442831215970965e-05, + "loss": 40.1401, + "step": 1398 + }, + { + "epoch": 5.050564334085779, + "grad_norm": 219.5385284423828, + "learning_rate": 2.243738656987296e-05, + "loss": 40.8834, + "step": 1399 + }, + { + "epoch": 5.054176072234763, + "grad_norm": 168.3094024658203, + "learning_rate": 2.2431941923774956e-05, + "loss": 40.4476, + "step": 1400 + }, + { + "epoch": 5.054176072234763, + "eval_loss": 0.6361114382743835, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 1400 + }, + { + "epoch": 5.057787810383747, + "grad_norm": 169.45201110839844, + "learning_rate": 2.242649727767695e-05, + "loss": 40.1949, + "step": 1401 + }, + { + "epoch": 5.061399548532731, + "grad_norm": 208.84634399414062, + "learning_rate": 2.2421052631578946e-05, + "loss": 41.0091, + "step": 1402 + }, + { + "epoch": 5.065011286681716, + "grad_norm": 248.86221313476562, + "learning_rate": 2.2415607985480945e-05, + "loss": 40.2435, + "step": 1403 + }, + { + "epoch": 5.0686230248307, + "grad_norm": 297.0834655761719, + "learning_rate": 2.241016333938294e-05, + "loss": 42.37, + "step": 1404 + }, + { + "epoch": 5.072234762979684, + "grad_norm": 242.12661743164062, + "learning_rate": 2.2404718693284936e-05, + "loss": 42.3822, + "step": 1405 + }, + { + "epoch": 5.075846501128668, + "grad_norm": 230.1178741455078, + "learning_rate": 2.2399274047186935e-05, + "loss": 41.3722, + "step": 1406 + }, + { + "epoch": 5.079458239277653, + "grad_norm": 191.32371520996094, + "learning_rate": 2.239382940108893e-05, + "loss": 41.8087, + "step": 1407 + }, + { + "epoch": 5.083069977426637, + "grad_norm": 267.28753662109375, + "learning_rate": 2.2388384754990925e-05, + "loss": 42.5938, + "step": 1408 + }, + { + "epoch": 5.0866817155756205, + "grad_norm": 186.61978149414062, + "learning_rate": 2.2382940108892924e-05, + "loss": 42.8553, + "step": 1409 + }, + { + "epoch": 5.090293453724605, + "grad_norm": 242.53433227539062, + "learning_rate": 2.237749546279492e-05, + "loss": 41.9677, + "step": 1410 + }, + { + "epoch": 5.090293453724605, + "eval_loss": 0.6330043077468872, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 1410 + }, + { + "epoch": 5.093905191873589, + "grad_norm": 199.74696350097656, + "learning_rate": 2.2372050816696915e-05, + "loss": 42.9821, + "step": 1411 + }, + { + "epoch": 5.097516930022573, + "grad_norm": 254.1063690185547, + "learning_rate": 2.236660617059891e-05, + "loss": 42.7956, + "step": 1412 + }, + { + "epoch": 5.101128668171557, + "grad_norm": 215.59056091308594, + "learning_rate": 2.2361161524500906e-05, + "loss": 43.6312, + "step": 1413 + }, + { + "epoch": 5.104740406320542, + "grad_norm": 218.69973754882812, + "learning_rate": 2.2355716878402904e-05, + "loss": 40.9468, + "step": 1414 + }, + { + "epoch": 5.108352144469526, + "grad_norm": 200.34927368164062, + "learning_rate": 2.23502722323049e-05, + "loss": 38.2656, + "step": 1415 + }, + { + "epoch": 5.11196388261851, + "grad_norm": 191.56883239746094, + "learning_rate": 2.23448275862069e-05, + "loss": 35.8111, + "step": 1416 + }, + { + "epoch": 5.115575620767494, + "grad_norm": 192.629150390625, + "learning_rate": 2.2339382940108894e-05, + "loss": 35.1287, + "step": 1417 + }, + { + "epoch": 5.119187358916479, + "grad_norm": 217.54855346679688, + "learning_rate": 2.233393829401089e-05, + "loss": 34.9664, + "step": 1418 + }, + { + "epoch": 5.122799097065463, + "grad_norm": 234.12355041503906, + "learning_rate": 2.2328493647912888e-05, + "loss": 35.9252, + "step": 1419 + }, + { + "epoch": 5.126410835214447, + "grad_norm": 201.83477783203125, + "learning_rate": 2.2323049001814884e-05, + "loss": 36.4664, + "step": 1420 + }, + { + "epoch": 5.126410835214447, + "eval_loss": 0.6359394192695618, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 1420 + }, + { + "epoch": 5.130022573363431, + "grad_norm": 212.38943481445312, + "learning_rate": 2.231760435571688e-05, + "loss": 35.2733, + "step": 1421 + }, + { + "epoch": 5.133634311512416, + "grad_norm": 219.8803253173828, + "learning_rate": 2.2312159709618874e-05, + "loss": 37.2009, + "step": 1422 + }, + { + "epoch": 5.1372460496614, + "grad_norm": 222.28221130371094, + "learning_rate": 2.230671506352087e-05, + "loss": 36.9338, + "step": 1423 + }, + { + "epoch": 5.140857787810384, + "grad_norm": 217.56607055664062, + "learning_rate": 2.2301270417422865e-05, + "loss": 38.0419, + "step": 1424 + }, + { + "epoch": 5.144469525959368, + "grad_norm": 232.7363739013672, + "learning_rate": 2.2295825771324867e-05, + "loss": 38.1393, + "step": 1425 + }, + { + "epoch": 5.148081264108352, + "grad_norm": 228.12091064453125, + "learning_rate": 2.2290381125226863e-05, + "loss": 37.4169, + "step": 1426 + }, + { + "epoch": 5.151693002257336, + "grad_norm": 247.9901580810547, + "learning_rate": 2.2284936479128858e-05, + "loss": 37.6386, + "step": 1427 + }, + { + "epoch": 5.15530474040632, + "grad_norm": 227.96649169921875, + "learning_rate": 2.2279491833030853e-05, + "loss": 38.7843, + "step": 1428 + }, + { + "epoch": 5.158916478555304, + "grad_norm": 197.85072326660156, + "learning_rate": 2.227404718693285e-05, + "loss": 37.7056, + "step": 1429 + }, + { + "epoch": 5.162528216704289, + "grad_norm": 270.6370544433594, + "learning_rate": 2.2268602540834848e-05, + "loss": 38.5554, + "step": 1430 + }, + { + "epoch": 5.162528216704289, + "eval_loss": 0.6463288068771362, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 1430 + }, + { + "epoch": 5.166139954853273, + "grad_norm": 251.65847778320312, + "learning_rate": 2.2263157894736843e-05, + "loss": 32.6593, + "step": 1431 + }, + { + "epoch": 5.169751693002257, + "grad_norm": 248.84368896484375, + "learning_rate": 2.225771324863884e-05, + "loss": 24.8031, + "step": 1432 + }, + { + "epoch": 5.173363431151241, + "grad_norm": 218.12979125976562, + "learning_rate": 2.2252268602540834e-05, + "loss": 23.8542, + "step": 1433 + }, + { + "epoch": 5.176975169300226, + "grad_norm": 171.4182586669922, + "learning_rate": 2.2246823956442832e-05, + "loss": 25.1994, + "step": 1434 + }, + { + "epoch": 5.18058690744921, + "grad_norm": 200.76271057128906, + "learning_rate": 2.2241379310344828e-05, + "loss": 25.1259, + "step": 1435 + }, + { + "epoch": 5.184198645598194, + "grad_norm": 324.8979797363281, + "learning_rate": 2.2235934664246827e-05, + "loss": 46.7466, + "step": 1436 + }, + { + "epoch": 5.187810383747179, + "grad_norm": 391.9200439453125, + "learning_rate": 2.2230490018148822e-05, + "loss": 47.366, + "step": 1437 + }, + { + "epoch": 5.191422121896163, + "grad_norm": 332.51080322265625, + "learning_rate": 2.2225045372050817e-05, + "loss": 47.5236, + "step": 1438 + }, + { + "epoch": 5.195033860045147, + "grad_norm": 295.85333251953125, + "learning_rate": 2.2219600725952813e-05, + "loss": 44.9235, + "step": 1439 + }, + { + "epoch": 5.198645598194131, + "grad_norm": 246.46482849121094, + "learning_rate": 2.2214156079854808e-05, + "loss": 44.5892, + "step": 1440 + }, + { + "epoch": 5.198645598194131, + "eval_loss": 0.6501885056495667, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.096, + "eval_steps_per_second": 57.096, + "step": 1440 + }, + { + "epoch": 5.2022573363431155, + "grad_norm": 224.99964904785156, + "learning_rate": 2.2208711433756807e-05, + "loss": 45.1496, + "step": 1441 + }, + { + "epoch": 5.2058690744920995, + "grad_norm": 201.5928497314453, + "learning_rate": 2.2203266787658802e-05, + "loss": 44.2362, + "step": 1442 + }, + { + "epoch": 5.209480812641083, + "grad_norm": 220.72509765625, + "learning_rate": 2.21978221415608e-05, + "loss": 45.7963, + "step": 1443 + }, + { + "epoch": 5.213092550790067, + "grad_norm": 229.04412841796875, + "learning_rate": 2.2192377495462796e-05, + "loss": 44.1812, + "step": 1444 + }, + { + "epoch": 5.216704288939052, + "grad_norm": 214.86207580566406, + "learning_rate": 2.2186932849364792e-05, + "loss": 44.364, + "step": 1445 + }, + { + "epoch": 5.220316027088036, + "grad_norm": 169.3239288330078, + "learning_rate": 2.2181488203266787e-05, + "loss": 44.1106, + "step": 1446 + }, + { + "epoch": 5.22392776523702, + "grad_norm": 180.3131561279297, + "learning_rate": 2.2176043557168786e-05, + "loss": 41.8791, + "step": 1447 + }, + { + "epoch": 5.227539503386004, + "grad_norm": 227.83078002929688, + "learning_rate": 2.217059891107078e-05, + "loss": 39.7917, + "step": 1448 + }, + { + "epoch": 5.231151241534989, + "grad_norm": 267.4294738769531, + "learning_rate": 2.2165154264972777e-05, + "loss": 41.2864, + "step": 1449 + }, + { + "epoch": 5.234762979683973, + "grad_norm": 210.79034423828125, + "learning_rate": 2.2159709618874772e-05, + "loss": 40.7219, + "step": 1450 + }, + { + "epoch": 5.234762979683973, + "eval_loss": 0.6369529366493225, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 1450 + }, + { + "epoch": 5.238374717832957, + "grad_norm": 205.2632598876953, + "learning_rate": 2.2154264972776768e-05, + "loss": 41.0364, + "step": 1451 + }, + { + "epoch": 5.241986455981941, + "grad_norm": 199.7196807861328, + "learning_rate": 2.214882032667877e-05, + "loss": 40.2733, + "step": 1452 + }, + { + "epoch": 5.245598194130926, + "grad_norm": 184.26495361328125, + "learning_rate": 2.2143375680580765e-05, + "loss": 40.3418, + "step": 1453 + }, + { + "epoch": 5.24920993227991, + "grad_norm": 170.1937713623047, + "learning_rate": 2.213793103448276e-05, + "loss": 40.5658, + "step": 1454 + }, + { + "epoch": 5.252821670428894, + "grad_norm": 167.71109008789062, + "learning_rate": 2.2132486388384756e-05, + "loss": 41.9252, + "step": 1455 + }, + { + "epoch": 5.2564334085778786, + "grad_norm": 184.73162841796875, + "learning_rate": 2.212704174228675e-05, + "loss": 40.0485, + "step": 1456 + }, + { + "epoch": 5.2600451467268625, + "grad_norm": 195.0812225341797, + "learning_rate": 2.2121597096188747e-05, + "loss": 41.6424, + "step": 1457 + }, + { + "epoch": 5.2636568848758465, + "grad_norm": 218.23553466796875, + "learning_rate": 2.2116152450090745e-05, + "loss": 40.6179, + "step": 1458 + }, + { + "epoch": 5.2672686230248305, + "grad_norm": 229.79299926757812, + "learning_rate": 2.211070780399274e-05, + "loss": 42.8747, + "step": 1459 + }, + { + "epoch": 5.270880361173815, + "grad_norm": 231.70692443847656, + "learning_rate": 2.2105263157894736e-05, + "loss": 42.7016, + "step": 1460 + }, + { + "epoch": 5.270880361173815, + "eval_loss": 0.6424433588981628, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 1460 + }, + { + "epoch": 5.274492099322799, + "grad_norm": 204.9513397216797, + "learning_rate": 2.209981851179673e-05, + "loss": 41.206, + "step": 1461 + }, + { + "epoch": 5.278103837471783, + "grad_norm": 220.89083862304688, + "learning_rate": 2.209437386569873e-05, + "loss": 44.0126, + "step": 1462 + }, + { + "epoch": 5.281715575620767, + "grad_norm": 266.7763671875, + "learning_rate": 2.208892921960073e-05, + "loss": 41.4934, + "step": 1463 + }, + { + "epoch": 5.285327313769752, + "grad_norm": 241.42636108398438, + "learning_rate": 2.2083484573502724e-05, + "loss": 43.3433, + "step": 1464 + }, + { + "epoch": 5.288939051918736, + "grad_norm": 221.7669219970703, + "learning_rate": 2.207803992740472e-05, + "loss": 35.9569, + "step": 1465 + }, + { + "epoch": 5.29255079006772, + "grad_norm": 236.0152130126953, + "learning_rate": 2.2072595281306715e-05, + "loss": 36.0824, + "step": 1466 + }, + { + "epoch": 5.296162528216704, + "grad_norm": 239.56224060058594, + "learning_rate": 2.206715063520871e-05, + "loss": 33.6127, + "step": 1467 + }, + { + "epoch": 5.299774266365689, + "grad_norm": 277.1287841796875, + "learning_rate": 2.2061705989110706e-05, + "loss": 36.11, + "step": 1468 + }, + { + "epoch": 5.303386004514673, + "grad_norm": 250.19515991210938, + "learning_rate": 2.2056261343012705e-05, + "loss": 36.9984, + "step": 1469 + }, + { + "epoch": 5.306997742663657, + "grad_norm": 214.2754669189453, + "learning_rate": 2.20508166969147e-05, + "loss": 36.5917, + "step": 1470 + }, + { + "epoch": 5.306997742663657, + "eval_loss": 0.6356943845748901, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 1470 + }, + { + "epoch": 5.310609480812641, + "grad_norm": 224.37388610839844, + "learning_rate": 2.20453720508167e-05, + "loss": 36.5302, + "step": 1471 + }, + { + "epoch": 5.314221218961626, + "grad_norm": 276.2541809082031, + "learning_rate": 2.2039927404718694e-05, + "loss": 36.7978, + "step": 1472 + }, + { + "epoch": 5.3178329571106095, + "grad_norm": 361.717041015625, + "learning_rate": 2.203448275862069e-05, + "loss": 37.4063, + "step": 1473 + }, + { + "epoch": 5.3214446952595935, + "grad_norm": 285.3569641113281, + "learning_rate": 2.202903811252269e-05, + "loss": 37.2472, + "step": 1474 + }, + { + "epoch": 5.3250564334085775, + "grad_norm": 268.160400390625, + "learning_rate": 2.2023593466424684e-05, + "loss": 37.7361, + "step": 1475 + }, + { + "epoch": 5.328668171557562, + "grad_norm": 211.38070678710938, + "learning_rate": 2.201814882032668e-05, + "loss": 37.7794, + "step": 1476 + }, + { + "epoch": 5.332279909706546, + "grad_norm": 214.10638427734375, + "learning_rate": 2.2012704174228675e-05, + "loss": 39.0787, + "step": 1477 + }, + { + "epoch": 5.33589164785553, + "grad_norm": 238.9603271484375, + "learning_rate": 2.200725952813067e-05, + "loss": 37.6853, + "step": 1478 + }, + { + "epoch": 5.339503386004514, + "grad_norm": 323.44976806640625, + "learning_rate": 2.2001814882032665e-05, + "loss": 38.2844, + "step": 1479 + }, + { + "epoch": 5.343115124153499, + "grad_norm": 289.6131896972656, + "learning_rate": 2.1996370235934668e-05, + "loss": 38.8953, + "step": 1480 + }, + { + "epoch": 5.343115124153499, + "eval_loss": 0.6462770700454712, + "eval_runtime": 3.1673, + "eval_samples_per_second": 56.516, + "eval_steps_per_second": 56.516, + "step": 1480 + }, + { + "epoch": 5.346726862302483, + "grad_norm": 197.47299194335938, + "learning_rate": 2.1990925589836663e-05, + "loss": 28.126, + "step": 1481 + }, + { + "epoch": 5.350338600451467, + "grad_norm": 198.37156677246094, + "learning_rate": 2.1985480943738658e-05, + "loss": 24.2205, + "step": 1482 + }, + { + "epoch": 5.353950338600452, + "grad_norm": 211.03501892089844, + "learning_rate": 2.1980036297640654e-05, + "loss": 24.119, + "step": 1483 + }, + { + "epoch": 5.357562076749436, + "grad_norm": 182.23316955566406, + "learning_rate": 2.197459165154265e-05, + "loss": 24.7386, + "step": 1484 + }, + { + "epoch": 5.36117381489842, + "grad_norm": 192.6392822265625, + "learning_rate": 2.1969147005444648e-05, + "loss": 26.0739, + "step": 1485 + }, + { + "epoch": 5.364785553047404, + "grad_norm": 380.62896728515625, + "learning_rate": 2.1963702359346643e-05, + "loss": 46.6945, + "step": 1486 + }, + { + "epoch": 5.368397291196389, + "grad_norm": 342.5572814941406, + "learning_rate": 2.195825771324864e-05, + "loss": 46.1797, + "step": 1487 + }, + { + "epoch": 5.372009029345373, + "grad_norm": 311.7198791503906, + "learning_rate": 2.1952813067150634e-05, + "loss": 45.6588, + "step": 1488 + }, + { + "epoch": 5.375620767494357, + "grad_norm": 260.9885559082031, + "learning_rate": 2.1947368421052633e-05, + "loss": 45.2405, + "step": 1489 + }, + { + "epoch": 5.3792325056433405, + "grad_norm": 263.3132019042969, + "learning_rate": 2.1941923774954628e-05, + "loss": 44.117, + "step": 1490 + }, + { + "epoch": 5.3792325056433405, + "eval_loss": 0.644275426864624, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 1490 + }, + { + "epoch": 5.382844243792325, + "grad_norm": 254.92022705078125, + "learning_rate": 2.1936479128856627e-05, + "loss": 45.4002, + "step": 1491 + }, + { + "epoch": 5.386455981941309, + "grad_norm": 246.1839599609375, + "learning_rate": 2.1931034482758622e-05, + "loss": 45.3481, + "step": 1492 + }, + { + "epoch": 5.390067720090293, + "grad_norm": 282.2879638671875, + "learning_rate": 2.1925589836660618e-05, + "loss": 45.3958, + "step": 1493 + }, + { + "epoch": 5.393679458239277, + "grad_norm": 266.9140930175781, + "learning_rate": 2.1920145190562613e-05, + "loss": 44.2959, + "step": 1494 + }, + { + "epoch": 5.397291196388262, + "grad_norm": 196.81199645996094, + "learning_rate": 2.191470054446461e-05, + "loss": 44.765, + "step": 1495 + }, + { + "epoch": 5.400902934537246, + "grad_norm": 270.7329406738281, + "learning_rate": 2.1909255898366607e-05, + "loss": 42.8581, + "step": 1496 + }, + { + "epoch": 5.40451467268623, + "grad_norm": 187.3281707763672, + "learning_rate": 2.1903811252268603e-05, + "loss": 40.7167, + "step": 1497 + }, + { + "epoch": 5.408126410835214, + "grad_norm": 302.9165954589844, + "learning_rate": 2.1898366606170598e-05, + "loss": 41.0712, + "step": 1498 + }, + { + "epoch": 5.411738148984199, + "grad_norm": 395.1492614746094, + "learning_rate": 2.1892921960072597e-05, + "loss": 40.4098, + "step": 1499 + }, + { + "epoch": 5.415349887133183, + "grad_norm": 253.91494750976562, + "learning_rate": 2.1887477313974592e-05, + "loss": 41.2985, + "step": 1500 + }, + { + "epoch": 5.415349887133183, + "eval_loss": 0.6383773684501648, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1500 + }, + { + "epoch": 5.418961625282167, + "grad_norm": 248.4109344482422, + "learning_rate": 2.1882032667876588e-05, + "loss": 41.179, + "step": 1501 + }, + { + "epoch": 5.422573363431152, + "grad_norm": 210.50015258789062, + "learning_rate": 2.1876588021778586e-05, + "loss": 41.1934, + "step": 1502 + }, + { + "epoch": 5.426185101580136, + "grad_norm": 170.64334106445312, + "learning_rate": 2.187114337568058e-05, + "loss": 41.5535, + "step": 1503 + }, + { + "epoch": 5.42979683972912, + "grad_norm": 249.41270446777344, + "learning_rate": 2.1865698729582577e-05, + "loss": 41.8323, + "step": 1504 + }, + { + "epoch": 5.433408577878104, + "grad_norm": 214.53770446777344, + "learning_rate": 2.1860254083484572e-05, + "loss": 42.1517, + "step": 1505 + }, + { + "epoch": 5.437020316027088, + "grad_norm": 225.6502227783203, + "learning_rate": 2.1854809437386568e-05, + "loss": 42.7675, + "step": 1506 + }, + { + "epoch": 5.440632054176072, + "grad_norm": 210.19219970703125, + "learning_rate": 2.1849364791288567e-05, + "loss": 42.5094, + "step": 1507 + }, + { + "epoch": 5.444243792325056, + "grad_norm": 187.03294372558594, + "learning_rate": 2.1843920145190565e-05, + "loss": 42.2218, + "step": 1508 + }, + { + "epoch": 5.44785553047404, + "grad_norm": 227.6764373779297, + "learning_rate": 2.183847549909256e-05, + "loss": 42.7061, + "step": 1509 + }, + { + "epoch": 5.451467268623025, + "grad_norm": 239.2847442626953, + "learning_rate": 2.1833030852994556e-05, + "loss": 43.1959, + "step": 1510 + }, + { + "epoch": 5.451467268623025, + "eval_loss": 0.6405091285705566, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 1510 + }, + { + "epoch": 5.455079006772009, + "grad_norm": 268.887451171875, + "learning_rate": 2.182758620689655e-05, + "loss": 42.4915, + "step": 1511 + }, + { + "epoch": 5.458690744920993, + "grad_norm": 261.0531311035156, + "learning_rate": 2.182214156079855e-05, + "loss": 42.1777, + "step": 1512 + }, + { + "epoch": 5.462302483069977, + "grad_norm": 241.58819580078125, + "learning_rate": 2.1816696914700546e-05, + "loss": 40.8728, + "step": 1513 + }, + { + "epoch": 5.465914221218962, + "grad_norm": 227.302001953125, + "learning_rate": 2.181125226860254e-05, + "loss": 39.8861, + "step": 1514 + }, + { + "epoch": 5.469525959367946, + "grad_norm": 293.8402404785156, + "learning_rate": 2.1805807622504536e-05, + "loss": 36.8716, + "step": 1515 + }, + { + "epoch": 5.47313769751693, + "grad_norm": 332.8829650878906, + "learning_rate": 2.1800362976406532e-05, + "loss": 35.6049, + "step": 1516 + }, + { + "epoch": 5.476749435665914, + "grad_norm": 271.6636962890625, + "learning_rate": 2.179491833030853e-05, + "loss": 34.6785, + "step": 1517 + }, + { + "epoch": 5.480361173814899, + "grad_norm": 211.5673065185547, + "learning_rate": 2.178947368421053e-05, + "loss": 35.5321, + "step": 1518 + }, + { + "epoch": 5.483972911963883, + "grad_norm": 168.95346069335938, + "learning_rate": 2.1784029038112525e-05, + "loss": 35.1604, + "step": 1519 + }, + { + "epoch": 5.487584650112867, + "grad_norm": 242.66725158691406, + "learning_rate": 2.177858439201452e-05, + "loss": 37.8709, + "step": 1520 + }, + { + "epoch": 5.487584650112867, + "eval_loss": 0.6324127912521362, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1520 + }, + { + "epoch": 5.491196388261851, + "grad_norm": 202.7799530029297, + "learning_rate": 2.1773139745916516e-05, + "loss": 38.1727, + "step": 1521 + }, + { + "epoch": 5.4948081264108355, + "grad_norm": 210.12704467773438, + "learning_rate": 2.176769509981851e-05, + "loss": 36.4171, + "step": 1522 + }, + { + "epoch": 5.4984198645598195, + "grad_norm": 214.7133331298828, + "learning_rate": 2.176225045372051e-05, + "loss": 37.7873, + "step": 1523 + }, + { + "epoch": 5.502031602708803, + "grad_norm": 197.89781188964844, + "learning_rate": 2.1756805807622505e-05, + "loss": 37.1096, + "step": 1524 + }, + { + "epoch": 5.505643340857787, + "grad_norm": 203.01992797851562, + "learning_rate": 2.17513611615245e-05, + "loss": 36.9907, + "step": 1525 + }, + { + "epoch": 5.509255079006772, + "grad_norm": 210.42164611816406, + "learning_rate": 2.17459165154265e-05, + "loss": 38.0291, + "step": 1526 + }, + { + "epoch": 5.512866817155756, + "grad_norm": 210.2798309326172, + "learning_rate": 2.1740471869328495e-05, + "loss": 37.5385, + "step": 1527 + }, + { + "epoch": 5.51647855530474, + "grad_norm": 217.986572265625, + "learning_rate": 2.173502722323049e-05, + "loss": 39.2736, + "step": 1528 + }, + { + "epoch": 5.520090293453725, + "grad_norm": 221.05831909179688, + "learning_rate": 2.172958257713249e-05, + "loss": 39.2733, + "step": 1529 + }, + { + "epoch": 5.523702031602709, + "grad_norm": 250.36065673828125, + "learning_rate": 2.1724137931034484e-05, + "loss": 37.8987, + "step": 1530 + }, + { + "epoch": 5.523702031602709, + "eval_loss": 0.6414559483528137, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 1530 + }, + { + "epoch": 5.527313769751693, + "grad_norm": 275.062255859375, + "learning_rate": 2.171869328493648e-05, + "loss": 29.4874, + "step": 1531 + }, + { + "epoch": 5.530925507900677, + "grad_norm": 178.79615783691406, + "learning_rate": 2.1713248638838475e-05, + "loss": 25.2165, + "step": 1532 + }, + { + "epoch": 5.534537246049661, + "grad_norm": 221.6693572998047, + "learning_rate": 2.170780399274047e-05, + "loss": 24.7139, + "step": 1533 + }, + { + "epoch": 5.538148984198646, + "grad_norm": 207.15869140625, + "learning_rate": 2.170235934664247e-05, + "loss": 25.2773, + "step": 1534 + }, + { + "epoch": 5.54176072234763, + "grad_norm": 193.37644958496094, + "learning_rate": 2.1696914700544468e-05, + "loss": 25.7936, + "step": 1535 + }, + { + "epoch": 5.545372460496614, + "grad_norm": 314.101318359375, + "learning_rate": 2.1691470054446463e-05, + "loss": 45.8573, + "step": 1536 + }, + { + "epoch": 5.5489841986455986, + "grad_norm": 376.9578552246094, + "learning_rate": 2.168602540834846e-05, + "loss": 47.1284, + "step": 1537 + }, + { + "epoch": 5.5525959367945825, + "grad_norm": 343.3904724121094, + "learning_rate": 2.1680580762250454e-05, + "loss": 45.1873, + "step": 1538 + }, + { + "epoch": 5.5562076749435665, + "grad_norm": 263.31768798828125, + "learning_rate": 2.167513611615245e-05, + "loss": 45.4906, + "step": 1539 + }, + { + "epoch": 5.5598194130925505, + "grad_norm": 295.50384521484375, + "learning_rate": 2.1669691470054448e-05, + "loss": 44.9259, + "step": 1540 + }, + { + "epoch": 5.5598194130925505, + "eval_loss": 0.6483813524246216, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.923, + "eval_steps_per_second": 56.923, + "step": 1540 + }, + { + "epoch": 5.563431151241535, + "grad_norm": 208.8861846923828, + "learning_rate": 2.1664246823956444e-05, + "loss": 43.7965, + "step": 1541 + }, + { + "epoch": 5.567042889390519, + "grad_norm": 195.8695526123047, + "learning_rate": 2.165880217785844e-05, + "loss": 44.7409, + "step": 1542 + }, + { + "epoch": 5.570654627539503, + "grad_norm": 218.10089111328125, + "learning_rate": 2.1653357531760434e-05, + "loss": 45.9364, + "step": 1543 + }, + { + "epoch": 5.574266365688487, + "grad_norm": 204.17205810546875, + "learning_rate": 2.164791288566243e-05, + "loss": 45.468, + "step": 1544 + }, + { + "epoch": 5.577878103837472, + "grad_norm": 239.03952026367188, + "learning_rate": 2.1642468239564432e-05, + "loss": 44.7685, + "step": 1545 + }, + { + "epoch": 5.581489841986456, + "grad_norm": 251.59300231933594, + "learning_rate": 2.1637023593466427e-05, + "loss": 43.011, + "step": 1546 + }, + { + "epoch": 5.58510158013544, + "grad_norm": 186.72540283203125, + "learning_rate": 2.1631578947368423e-05, + "loss": 41.5255, + "step": 1547 + }, + { + "epoch": 5.588713318284425, + "grad_norm": 199.89732360839844, + "learning_rate": 2.1626134301270418e-05, + "loss": 40.2522, + "step": 1548 + }, + { + "epoch": 5.592325056433409, + "grad_norm": 182.16624450683594, + "learning_rate": 2.1620689655172413e-05, + "loss": 41.0931, + "step": 1549 + }, + { + "epoch": 5.595936794582393, + "grad_norm": 221.58680725097656, + "learning_rate": 2.161524500907441e-05, + "loss": 40.2717, + "step": 1550 + }, + { + "epoch": 5.595936794582393, + "eval_loss": 0.6393340229988098, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 1550 + }, + { + "epoch": 5.599548532731377, + "grad_norm": 209.82183837890625, + "learning_rate": 2.1609800362976408e-05, + "loss": 41.7522, + "step": 1551 + }, + { + "epoch": 5.603160270880361, + "grad_norm": 226.1896209716797, + "learning_rate": 2.1604355716878403e-05, + "loss": 40.8078, + "step": 1552 + }, + { + "epoch": 5.606772009029346, + "grad_norm": 219.57899475097656, + "learning_rate": 2.1598911070780398e-05, + "loss": 42.2331, + "step": 1553 + }, + { + "epoch": 5.6103837471783295, + "grad_norm": 185.2303009033203, + "learning_rate": 2.1593466424682397e-05, + "loss": 42.0695, + "step": 1554 + }, + { + "epoch": 5.6139954853273135, + "grad_norm": 192.32913208007812, + "learning_rate": 2.1588021778584392e-05, + "loss": 42.1317, + "step": 1555 + }, + { + "epoch": 5.617607223476298, + "grad_norm": 183.3128662109375, + "learning_rate": 2.158257713248639e-05, + "loss": 40.4957, + "step": 1556 + }, + { + "epoch": 5.621218961625282, + "grad_norm": 178.10691833496094, + "learning_rate": 2.1577132486388387e-05, + "loss": 40.9154, + "step": 1557 + }, + { + "epoch": 5.624830699774266, + "grad_norm": 207.3495330810547, + "learning_rate": 2.1571687840290382e-05, + "loss": 42.8389, + "step": 1558 + }, + { + "epoch": 5.62844243792325, + "grad_norm": 191.46353149414062, + "learning_rate": 2.1566243194192377e-05, + "loss": 41.9483, + "step": 1559 + }, + { + "epoch": 5.632054176072235, + "grad_norm": 218.9544219970703, + "learning_rate": 2.1560798548094373e-05, + "loss": 41.2037, + "step": 1560 + }, + { + "epoch": 5.632054176072235, + "eval_loss": 0.6345452070236206, + "eval_runtime": 3.1432, + "eval_samples_per_second": 56.949, + "eval_steps_per_second": 56.949, + "step": 1560 + }, + { + "epoch": 5.635665914221219, + "grad_norm": 235.9405059814453, + "learning_rate": 2.1555353901996368e-05, + "loss": 43.1159, + "step": 1561 + }, + { + "epoch": 5.639277652370203, + "grad_norm": 207.1119384765625, + "learning_rate": 2.1549909255898367e-05, + "loss": 43.4384, + "step": 1562 + }, + { + "epoch": 5.642889390519187, + "grad_norm": 305.3013916015625, + "learning_rate": 2.1544464609800366e-05, + "loss": 42.436, + "step": 1563 + }, + { + "epoch": 5.646501128668172, + "grad_norm": 226.25282287597656, + "learning_rate": 2.153901996370236e-05, + "loss": 39.6844, + "step": 1564 + }, + { + "epoch": 5.650112866817156, + "grad_norm": 201.5033416748047, + "learning_rate": 2.1533575317604356e-05, + "loss": 35.9103, + "step": 1565 + }, + { + "epoch": 5.65372460496614, + "grad_norm": 206.63229370117188, + "learning_rate": 2.1528130671506352e-05, + "loss": 35.0026, + "step": 1566 + }, + { + "epoch": 5.657336343115124, + "grad_norm": 212.67581176757812, + "learning_rate": 2.152268602540835e-05, + "loss": 35.6298, + "step": 1567 + }, + { + "epoch": 5.660948081264109, + "grad_norm": 193.2886199951172, + "learning_rate": 2.1517241379310346e-05, + "loss": 36.0356, + "step": 1568 + }, + { + "epoch": 5.664559819413093, + "grad_norm": 166.189208984375, + "learning_rate": 2.151179673321234e-05, + "loss": 35.5423, + "step": 1569 + }, + { + "epoch": 5.668171557562077, + "grad_norm": 288.91552734375, + "learning_rate": 2.1506352087114337e-05, + "loss": 36.6227, + "step": 1570 + }, + { + "epoch": 5.668171557562077, + "eval_loss": 0.6339959502220154, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.024, + "eval_steps_per_second": 57.024, + "step": 1570 + }, + { + "epoch": 5.6717832957110605, + "grad_norm": 210.91664123535156, + "learning_rate": 2.1500907441016332e-05, + "loss": 37.3015, + "step": 1571 + }, + { + "epoch": 5.675395033860045, + "grad_norm": 206.54299926757812, + "learning_rate": 2.149546279491833e-05, + "loss": 36.961, + "step": 1572 + }, + { + "epoch": 5.679006772009029, + "grad_norm": 206.55613708496094, + "learning_rate": 2.149001814882033e-05, + "loss": 36.722, + "step": 1573 + }, + { + "epoch": 5.682618510158013, + "grad_norm": 206.86563110351562, + "learning_rate": 2.1484573502722325e-05, + "loss": 37.7482, + "step": 1574 + }, + { + "epoch": 5.686230248306998, + "grad_norm": 219.96533203125, + "learning_rate": 2.147912885662432e-05, + "loss": 37.7964, + "step": 1575 + }, + { + "epoch": 5.689841986455982, + "grad_norm": 226.23887634277344, + "learning_rate": 2.1473684210526316e-05, + "loss": 38.6577, + "step": 1576 + }, + { + "epoch": 5.693453724604966, + "grad_norm": 195.1751708984375, + "learning_rate": 2.146823956442831e-05, + "loss": 36.9764, + "step": 1577 + }, + { + "epoch": 5.69706546275395, + "grad_norm": 194.3510284423828, + "learning_rate": 2.146279491833031e-05, + "loss": 39.4842, + "step": 1578 + }, + { + "epoch": 5.700677200902934, + "grad_norm": 187.02281188964844, + "learning_rate": 2.1457350272232305e-05, + "loss": 38.9574, + "step": 1579 + }, + { + "epoch": 5.704288939051919, + "grad_norm": 242.91925048828125, + "learning_rate": 2.14519056261343e-05, + "loss": 37.6359, + "step": 1580 + }, + { + "epoch": 5.704288939051919, + "eval_loss": 0.6384473443031311, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 1580 + }, + { + "epoch": 5.707900677200903, + "grad_norm": 242.9617156982422, + "learning_rate": 2.14464609800363e-05, + "loss": 31.3564, + "step": 1581 + }, + { + "epoch": 5.711512415349887, + "grad_norm": 182.00540161132812, + "learning_rate": 2.1441016333938295e-05, + "loss": 24.2933, + "step": 1582 + }, + { + "epoch": 5.715124153498872, + "grad_norm": 257.7115173339844, + "learning_rate": 2.143557168784029e-05, + "loss": 24.6299, + "step": 1583 + }, + { + "epoch": 5.718735891647856, + "grad_norm": 198.71554565429688, + "learning_rate": 2.143012704174229e-05, + "loss": 24.7344, + "step": 1584 + }, + { + "epoch": 5.72234762979684, + "grad_norm": 198.24520874023438, + "learning_rate": 2.1424682395644284e-05, + "loss": 26.0825, + "step": 1585 + }, + { + "epoch": 5.725959367945824, + "grad_norm": 248.9528045654297, + "learning_rate": 2.141923774954628e-05, + "loss": 45.1176, + "step": 1586 + }, + { + "epoch": 5.7295711060948085, + "grad_norm": 293.7327575683594, + "learning_rate": 2.1413793103448275e-05, + "loss": 45.8517, + "step": 1587 + }, + { + "epoch": 5.733182844243792, + "grad_norm": 293.1148681640625, + "learning_rate": 2.140834845735027e-05, + "loss": 45.6659, + "step": 1588 + }, + { + "epoch": 5.736794582392776, + "grad_norm": 312.7779846191406, + "learning_rate": 2.140290381125227e-05, + "loss": 44.4863, + "step": 1589 + }, + { + "epoch": 5.74040632054176, + "grad_norm": 309.1000061035156, + "learning_rate": 2.1397459165154265e-05, + "loss": 43.649, + "step": 1590 + }, + { + "epoch": 5.74040632054176, + "eval_loss": 0.6471736431121826, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 1590 + }, + { + "epoch": 5.744018058690745, + "grad_norm": 276.4226989746094, + "learning_rate": 2.1392014519056263e-05, + "loss": 45.3135, + "step": 1591 + }, + { + "epoch": 5.747629796839729, + "grad_norm": 233.6791229248047, + "learning_rate": 2.138656987295826e-05, + "loss": 44.4919, + "step": 1592 + }, + { + "epoch": 5.751241534988713, + "grad_norm": 194.2917022705078, + "learning_rate": 2.1381125226860254e-05, + "loss": 44.8033, + "step": 1593 + }, + { + "epoch": 5.754853273137698, + "grad_norm": 241.76060485839844, + "learning_rate": 2.137568058076225e-05, + "loss": 45.1427, + "step": 1594 + }, + { + "epoch": 5.758465011286682, + "grad_norm": 216.56283569335938, + "learning_rate": 2.137023593466425e-05, + "loss": 43.1769, + "step": 1595 + }, + { + "epoch": 5.762076749435666, + "grad_norm": 230.0026092529297, + "learning_rate": 2.1364791288566244e-05, + "loss": 44.1141, + "step": 1596 + }, + { + "epoch": 5.76568848758465, + "grad_norm": 191.55433654785156, + "learning_rate": 2.135934664246824e-05, + "loss": 40.7227, + "step": 1597 + }, + { + "epoch": 5.769300225733634, + "grad_norm": 180.25885009765625, + "learning_rate": 2.1353901996370235e-05, + "loss": 40.9842, + "step": 1598 + }, + { + "epoch": 5.772911963882619, + "grad_norm": 220.4018096923828, + "learning_rate": 2.134845735027223e-05, + "loss": 40.0403, + "step": 1599 + }, + { + "epoch": 5.776523702031603, + "grad_norm": 264.20587158203125, + "learning_rate": 2.1343012704174232e-05, + "loss": 40.1543, + "step": 1600 + }, + { + "epoch": 5.776523702031603, + "eval_loss": 0.6374311447143555, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1600 + }, + { + "epoch": 5.780135440180587, + "grad_norm": 167.9457244873047, + "learning_rate": 2.1337568058076227e-05, + "loss": 40.9575, + "step": 1601 + }, + { + "epoch": 5.7837471783295715, + "grad_norm": 190.05247497558594, + "learning_rate": 2.1332123411978223e-05, + "loss": 39.5593, + "step": 1602 + }, + { + "epoch": 5.7873589164785555, + "grad_norm": 246.4980926513672, + "learning_rate": 2.1326678765880218e-05, + "loss": 40.7016, + "step": 1603 + }, + { + "epoch": 5.7909706546275395, + "grad_norm": 208.7435302734375, + "learning_rate": 2.1321234119782214e-05, + "loss": 41.7855, + "step": 1604 + }, + { + "epoch": 5.794582392776523, + "grad_norm": 190.84188842773438, + "learning_rate": 2.1315789473684212e-05, + "loss": 41.2129, + "step": 1605 + }, + { + "epoch": 5.798194130925508, + "grad_norm": 196.7161102294922, + "learning_rate": 2.1310344827586208e-05, + "loss": 40.8209, + "step": 1606 + }, + { + "epoch": 5.801805869074492, + "grad_norm": 181.4319305419922, + "learning_rate": 2.1304900181488203e-05, + "loss": 41.8345, + "step": 1607 + }, + { + "epoch": 5.805417607223476, + "grad_norm": 201.2064971923828, + "learning_rate": 2.12994555353902e-05, + "loss": 43.1464, + "step": 1608 + }, + { + "epoch": 5.80902934537246, + "grad_norm": 199.15174865722656, + "learning_rate": 2.1294010889292197e-05, + "loss": 42.6041, + "step": 1609 + }, + { + "epoch": 5.812641083521445, + "grad_norm": 231.0398406982422, + "learning_rate": 2.1288566243194193e-05, + "loss": 42.867, + "step": 1610 + }, + { + "epoch": 5.812641083521445, + "eval_loss": 0.6334222555160522, + "eval_runtime": 3.1534, + "eval_samples_per_second": 56.764, + "eval_steps_per_second": 56.764, + "step": 1610 + }, + { + "epoch": 5.816252821670429, + "grad_norm": 189.26132202148438, + "learning_rate": 2.128312159709619e-05, + "loss": 41.7717, + "step": 1611 + }, + { + "epoch": 5.819864559819413, + "grad_norm": 215.5289764404297, + "learning_rate": 2.1277676950998187e-05, + "loss": 41.3994, + "step": 1612 + }, + { + "epoch": 5.823476297968397, + "grad_norm": 267.4259033203125, + "learning_rate": 2.1272232304900182e-05, + "loss": 41.8173, + "step": 1613 + }, + { + "epoch": 5.827088036117382, + "grad_norm": 241.74749755859375, + "learning_rate": 2.1266787658802178e-05, + "loss": 39.9873, + "step": 1614 + }, + { + "epoch": 5.830699774266366, + "grad_norm": 242.233642578125, + "learning_rate": 2.1261343012704173e-05, + "loss": 37.0662, + "step": 1615 + }, + { + "epoch": 5.83431151241535, + "grad_norm": 217.06141662597656, + "learning_rate": 2.1255898366606172e-05, + "loss": 36.8948, + "step": 1616 + }, + { + "epoch": 5.837923250564334, + "grad_norm": 242.05567932128906, + "learning_rate": 2.1250453720508167e-05, + "loss": 34.9909, + "step": 1617 + }, + { + "epoch": 5.8415349887133186, + "grad_norm": 178.65618896484375, + "learning_rate": 2.1245009074410166e-05, + "loss": 35.603, + "step": 1618 + }, + { + "epoch": 5.8451467268623025, + "grad_norm": 216.36865234375, + "learning_rate": 2.123956442831216e-05, + "loss": 35.9822, + "step": 1619 + }, + { + "epoch": 5.8487584650112865, + "grad_norm": 241.22161865234375, + "learning_rate": 2.1234119782214157e-05, + "loss": 35.1473, + "step": 1620 + }, + { + "epoch": 5.8487584650112865, + "eval_loss": 0.6312161087989807, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 1620 + }, + { + "epoch": 5.852370203160271, + "grad_norm": 192.05210876464844, + "learning_rate": 2.1228675136116152e-05, + "loss": 36.145, + "step": 1621 + }, + { + "epoch": 5.855981941309255, + "grad_norm": 194.0652618408203, + "learning_rate": 2.122323049001815e-05, + "loss": 37.7076, + "step": 1622 + }, + { + "epoch": 5.859593679458239, + "grad_norm": 255.59286499023438, + "learning_rate": 2.1217785843920146e-05, + "loss": 37.6837, + "step": 1623 + }, + { + "epoch": 5.863205417607223, + "grad_norm": 184.0017852783203, + "learning_rate": 2.121234119782214e-05, + "loss": 37.1681, + "step": 1624 + }, + { + "epoch": 5.866817155756207, + "grad_norm": 186.98338317871094, + "learning_rate": 2.1206896551724137e-05, + "loss": 37.4902, + "step": 1625 + }, + { + "epoch": 5.870428893905192, + "grad_norm": 253.53775024414062, + "learning_rate": 2.1201451905626132e-05, + "loss": 37.2771, + "step": 1626 + }, + { + "epoch": 5.874040632054176, + "grad_norm": 196.43038940429688, + "learning_rate": 2.119600725952813e-05, + "loss": 37.7681, + "step": 1627 + }, + { + "epoch": 5.87765237020316, + "grad_norm": 255.99879455566406, + "learning_rate": 2.119056261343013e-05, + "loss": 40.0097, + "step": 1628 + }, + { + "epoch": 5.881264108352145, + "grad_norm": 275.1465148925781, + "learning_rate": 2.1185117967332125e-05, + "loss": 38.1076, + "step": 1629 + }, + { + "epoch": 5.884875846501129, + "grad_norm": 281.8592529296875, + "learning_rate": 2.117967332123412e-05, + "loss": 38.6463, + "step": 1630 + }, + { + "epoch": 5.884875846501129, + "eval_loss": 0.6449099779129028, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 1630 + }, + { + "epoch": 5.888487584650113, + "grad_norm": 246.7912139892578, + "learning_rate": 2.1174228675136116e-05, + "loss": 36.9158, + "step": 1631 + }, + { + "epoch": 5.892099322799097, + "grad_norm": 176.7545623779297, + "learning_rate": 2.116878402903811e-05, + "loss": 25.1153, + "step": 1632 + }, + { + "epoch": 5.895711060948082, + "grad_norm": 202.2602996826172, + "learning_rate": 2.116333938294011e-05, + "loss": 24.1999, + "step": 1633 + }, + { + "epoch": 5.899322799097066, + "grad_norm": 186.26255798339844, + "learning_rate": 2.1157894736842106e-05, + "loss": 24.185, + "step": 1634 + }, + { + "epoch": 5.9029345372460496, + "grad_norm": 231.0543670654297, + "learning_rate": 2.11524500907441e-05, + "loss": 26.1841, + "step": 1635 + }, + { + "epoch": 5.9065462753950335, + "grad_norm": 336.677001953125, + "learning_rate": 2.1147005444646096e-05, + "loss": 47.1367, + "step": 1636 + }, + { + "epoch": 5.910158013544018, + "grad_norm": 299.3211975097656, + "learning_rate": 2.1141560798548095e-05, + "loss": 46.7711, + "step": 1637 + }, + { + "epoch": 5.913769751693002, + "grad_norm": 287.5389099121094, + "learning_rate": 2.1136116152450094e-05, + "loss": 44.9163, + "step": 1638 + }, + { + "epoch": 5.917381489841986, + "grad_norm": 290.34930419921875, + "learning_rate": 2.113067150635209e-05, + "loss": 45.1651, + "step": 1639 + }, + { + "epoch": 5.92099322799097, + "grad_norm": 244.7100372314453, + "learning_rate": 2.1125226860254085e-05, + "loss": 45.6252, + "step": 1640 + }, + { + "epoch": 5.92099322799097, + "eval_loss": 0.6506878733634949, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 1640 + }, + { + "epoch": 5.924604966139955, + "grad_norm": 301.48223876953125, + "learning_rate": 2.111978221415608e-05, + "loss": 44.5345, + "step": 1641 + }, + { + "epoch": 5.928216704288939, + "grad_norm": 261.05987548828125, + "learning_rate": 2.1114337568058075e-05, + "loss": 42.0263, + "step": 1642 + }, + { + "epoch": 5.931828442437923, + "grad_norm": 220.4369659423828, + "learning_rate": 2.110889292196007e-05, + "loss": 41.2405, + "step": 1643 + }, + { + "epoch": 5.935440180586907, + "grad_norm": 261.3221435546875, + "learning_rate": 2.110344827586207e-05, + "loss": 42.2734, + "step": 1644 + }, + { + "epoch": 5.939051918735892, + "grad_norm": 253.70855712890625, + "learning_rate": 2.1098003629764065e-05, + "loss": 43.0752, + "step": 1645 + }, + { + "epoch": 5.942663656884876, + "grad_norm": 198.76138305664062, + "learning_rate": 2.1092558983666064e-05, + "loss": 42.7103, + "step": 1646 + }, + { + "epoch": 5.94627539503386, + "grad_norm": 212.21466064453125, + "learning_rate": 2.108711433756806e-05, + "loss": 42.6215, + "step": 1647 + }, + { + "epoch": 5.949887133182845, + "grad_norm": 212.9633026123047, + "learning_rate": 2.1081669691470055e-05, + "loss": 42.795, + "step": 1648 + }, + { + "epoch": 5.953498871331829, + "grad_norm": 263.2871398925781, + "learning_rate": 2.1076225045372053e-05, + "loss": 43.8843, + "step": 1649 + }, + { + "epoch": 5.957110609480813, + "grad_norm": 207.67120361328125, + "learning_rate": 2.107078039927405e-05, + "loss": 43.0161, + "step": 1650 + }, + { + "epoch": 5.957110609480813, + "eval_loss": 0.6315081715583801, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1650 + }, + { + "epoch": 5.960722347629797, + "grad_norm": 176.6342010498047, + "learning_rate": 2.1065335753176044e-05, + "loss": 38.803, + "step": 1651 + }, + { + "epoch": 5.9643340857787805, + "grad_norm": 223.57485961914062, + "learning_rate": 2.105989110707804e-05, + "loss": 35.1905, + "step": 1652 + }, + { + "epoch": 5.967945823927765, + "grad_norm": 291.507568359375, + "learning_rate": 2.1054446460980035e-05, + "loss": 34.9454, + "step": 1653 + }, + { + "epoch": 5.971557562076749, + "grad_norm": 250.51063537597656, + "learning_rate": 2.104900181488203e-05, + "loss": 37.4404, + "step": 1654 + }, + { + "epoch": 5.975169300225733, + "grad_norm": 307.9601135253906, + "learning_rate": 2.1043557168784032e-05, + "loss": 36.9775, + "step": 1655 + }, + { + "epoch": 5.978781038374718, + "grad_norm": 277.24151611328125, + "learning_rate": 2.1038112522686028e-05, + "loss": 38.2696, + "step": 1656 + }, + { + "epoch": 5.982392776523702, + "grad_norm": 186.7593994140625, + "learning_rate": 2.1032667876588023e-05, + "loss": 37.0656, + "step": 1657 + }, + { + "epoch": 5.986004514672686, + "grad_norm": 201.67047119140625, + "learning_rate": 2.102722323049002e-05, + "loss": 38.1747, + "step": 1658 + }, + { + "epoch": 5.98961625282167, + "grad_norm": 216.87525939941406, + "learning_rate": 2.1021778584392014e-05, + "loss": 39.3248, + "step": 1659 + }, + { + "epoch": 5.993227990970655, + "grad_norm": 227.381103515625, + "learning_rate": 2.1016333938294013e-05, + "loss": 33.4017, + "step": 1660 + }, + { + "epoch": 5.993227990970655, + "eval_loss": 0.6369583010673523, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 1660 + }, + { + "epoch": 5.996839729119639, + "grad_norm": 237.2648468017578, + "learning_rate": 2.1010889292196008e-05, + "loss": 24.679, + "step": 1661 + }, + { + "epoch": 6.0, + "grad_norm": 191.99951171875, + "learning_rate": 2.1005444646098003e-05, + "loss": 21.9552, + "step": 1662 + }, + { + "epoch": 6.003611738148984, + "grad_norm": 267.92181396484375, + "learning_rate": 2.1e-05, + "loss": 43.6884, + "step": 1663 + }, + { + "epoch": 6.007223476297969, + "grad_norm": 318.86602783203125, + "learning_rate": 2.0994555353901998e-05, + "loss": 46.0709, + "step": 1664 + }, + { + "epoch": 6.010835214446953, + "grad_norm": 282.772705078125, + "learning_rate": 2.0989110707803993e-05, + "loss": 44.2746, + "step": 1665 + }, + { + "epoch": 6.014446952595937, + "grad_norm": 263.2024841308594, + "learning_rate": 2.0983666061705992e-05, + "loss": 43.818, + "step": 1666 + }, + { + "epoch": 6.018058690744921, + "grad_norm": 229.41725158691406, + "learning_rate": 2.0978221415607987e-05, + "loss": 43.9441, + "step": 1667 + }, + { + "epoch": 6.021670428893906, + "grad_norm": 253.25624084472656, + "learning_rate": 2.0972776769509983e-05, + "loss": 43.517, + "step": 1668 + }, + { + "epoch": 6.0252821670428895, + "grad_norm": 202.00238037109375, + "learning_rate": 2.0967332123411978e-05, + "loss": 44.3685, + "step": 1669 + }, + { + "epoch": 6.0288939051918735, + "grad_norm": 196.92825317382812, + "learning_rate": 2.0961887477313973e-05, + "loss": 44.9367, + "step": 1670 + }, + { + "epoch": 6.0288939051918735, + "eval_loss": 0.6381568312644958, + "eval_runtime": 3.1477, + "eval_samples_per_second": 56.867, + "eval_steps_per_second": 56.867, + "step": 1670 + }, + { + "epoch": 6.0325056433408575, + "grad_norm": 191.00900268554688, + "learning_rate": 2.0956442831215972e-05, + "loss": 44.0743, + "step": 1671 + }, + { + "epoch": 6.036117381489842, + "grad_norm": 195.92141723632812, + "learning_rate": 2.0950998185117967e-05, + "loss": 43.3278, + "step": 1672 + }, + { + "epoch": 6.039729119638826, + "grad_norm": 230.04708862304688, + "learning_rate": 2.0945553539019963e-05, + "loss": 41.6419, + "step": 1673 + }, + { + "epoch": 6.04334085778781, + "grad_norm": 215.70689392089844, + "learning_rate": 2.094010889292196e-05, + "loss": 41.0927, + "step": 1674 + }, + { + "epoch": 6.046952595936794, + "grad_norm": 227.51797485351562, + "learning_rate": 2.0934664246823957e-05, + "loss": 40.1888, + "step": 1675 + }, + { + "epoch": 6.050564334085779, + "grad_norm": 216.93089294433594, + "learning_rate": 2.0929219600725952e-05, + "loss": 39.8766, + "step": 1676 + }, + { + "epoch": 6.054176072234763, + "grad_norm": 199.3091583251953, + "learning_rate": 2.092377495462795e-05, + "loss": 40.3851, + "step": 1677 + }, + { + "epoch": 6.057787810383747, + "grad_norm": 188.56056213378906, + "learning_rate": 2.0918330308529947e-05, + "loss": 40.5289, + "step": 1678 + }, + { + "epoch": 6.061399548532731, + "grad_norm": 194.23265075683594, + "learning_rate": 2.0912885662431942e-05, + "loss": 40.7509, + "step": 1679 + }, + { + "epoch": 6.065011286681716, + "grad_norm": 199.7327423095703, + "learning_rate": 2.0907441016333937e-05, + "loss": 41.3404, + "step": 1680 + }, + { + "epoch": 6.065011286681716, + "eval_loss": 0.6312655806541443, + "eval_runtime": 3.1482, + "eval_samples_per_second": 56.858, + "eval_steps_per_second": 56.858, + "step": 1680 + }, + { + "epoch": 6.0686230248307, + "grad_norm": 189.40150451660156, + "learning_rate": 2.0901996370235933e-05, + "loss": 41.3719, + "step": 1681 + }, + { + "epoch": 6.072234762979684, + "grad_norm": 222.07705688476562, + "learning_rate": 2.089655172413793e-05, + "loss": 41.8194, + "step": 1682 + }, + { + "epoch": 6.075846501128668, + "grad_norm": 205.6264190673828, + "learning_rate": 2.089110707803993e-05, + "loss": 39.8522, + "step": 1683 + }, + { + "epoch": 6.079458239277653, + "grad_norm": 207.98802185058594, + "learning_rate": 2.0885662431941926e-05, + "loss": 41.5093, + "step": 1684 + }, + { + "epoch": 6.083069977426637, + "grad_norm": 197.24134826660156, + "learning_rate": 2.088021778584392e-05, + "loss": 41.7284, + "step": 1685 + }, + { + "epoch": 6.0866817155756205, + "grad_norm": 220.84255981445312, + "learning_rate": 2.0874773139745916e-05, + "loss": 42.7841, + "step": 1686 + }, + { + "epoch": 6.090293453724605, + "grad_norm": 239.06854248046875, + "learning_rate": 2.0869328493647912e-05, + "loss": 43.6391, + "step": 1687 + }, + { + "epoch": 6.093905191873589, + "grad_norm": 193.2572021484375, + "learning_rate": 2.086388384754991e-05, + "loss": 41.9963, + "step": 1688 + }, + { + "epoch": 6.097516930022573, + "grad_norm": 206.66473388671875, + "learning_rate": 2.0858439201451906e-05, + "loss": 41.9834, + "step": 1689 + }, + { + "epoch": 6.101128668171557, + "grad_norm": 214.81956481933594, + "learning_rate": 2.08529945553539e-05, + "loss": 41.7128, + "step": 1690 + }, + { + "epoch": 6.101128668171557, + "eval_loss": 0.6309775114059448, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1690 + }, + { + "epoch": 6.104740406320542, + "grad_norm": 189.58360290527344, + "learning_rate": 2.0847549909255897e-05, + "loss": 37.7807, + "step": 1691 + }, + { + "epoch": 6.108352144469526, + "grad_norm": 265.76934814453125, + "learning_rate": 2.0842105263157895e-05, + "loss": 37.7091, + "step": 1692 + }, + { + "epoch": 6.11196388261851, + "grad_norm": 266.4632568359375, + "learning_rate": 2.0836660617059894e-05, + "loss": 34.7386, + "step": 1693 + }, + { + "epoch": 6.115575620767494, + "grad_norm": 309.3799743652344, + "learning_rate": 2.083121597096189e-05, + "loss": 34.9386, + "step": 1694 + }, + { + "epoch": 6.119187358916479, + "grad_norm": 252.98681640625, + "learning_rate": 2.0825771324863885e-05, + "loss": 34.9113, + "step": 1695 + }, + { + "epoch": 6.122799097065463, + "grad_norm": 199.3408660888672, + "learning_rate": 2.082032667876588e-05, + "loss": 35.1914, + "step": 1696 + }, + { + "epoch": 6.126410835214447, + "grad_norm": 231.67514038085938, + "learning_rate": 2.0814882032667876e-05, + "loss": 36.3151, + "step": 1697 + }, + { + "epoch": 6.130022573363431, + "grad_norm": 215.49317932128906, + "learning_rate": 2.080943738656987e-05, + "loss": 37.6763, + "step": 1698 + }, + { + "epoch": 6.133634311512416, + "grad_norm": 239.3602752685547, + "learning_rate": 2.080399274047187e-05, + "loss": 35.7805, + "step": 1699 + }, + { + "epoch": 6.1372460496614, + "grad_norm": 192.8195037841797, + "learning_rate": 2.0798548094373865e-05, + "loss": 36.7353, + "step": 1700 + }, + { + "epoch": 6.1372460496614, + "eval_loss": 0.6290757060050964, + "eval_runtime": 3.1486, + "eval_samples_per_second": 56.851, + "eval_steps_per_second": 56.851, + "step": 1700 + }, + { + "epoch": 6.140857787810384, + "grad_norm": 191.125, + "learning_rate": 2.0793103448275864e-05, + "loss": 36.6377, + "step": 1701 + }, + { + "epoch": 6.144469525959368, + "grad_norm": 232.39170837402344, + "learning_rate": 2.078765880217786e-05, + "loss": 36.5235, + "step": 1702 + }, + { + "epoch": 6.148081264108352, + "grad_norm": 259.41204833984375, + "learning_rate": 2.0782214156079855e-05, + "loss": 37.7093, + "step": 1703 + }, + { + "epoch": 6.151693002257336, + "grad_norm": 218.00814819335938, + "learning_rate": 2.0776769509981854e-05, + "loss": 37.8061, + "step": 1704 + }, + { + "epoch": 6.15530474040632, + "grad_norm": 183.78170776367188, + "learning_rate": 2.077132486388385e-05, + "loss": 37.9451, + "step": 1705 + }, + { + "epoch": 6.158916478555304, + "grad_norm": 242.387939453125, + "learning_rate": 2.0765880217785844e-05, + "loss": 38.687, + "step": 1706 + }, + { + "epoch": 6.162528216704289, + "grad_norm": 247.09152221679688, + "learning_rate": 2.076043557168784e-05, + "loss": 38.5109, + "step": 1707 + }, + { + "epoch": 6.166139954853273, + "grad_norm": 202.3104705810547, + "learning_rate": 2.0754990925589835e-05, + "loss": 28.0115, + "step": 1708 + }, + { + "epoch": 6.169751693002257, + "grad_norm": 239.5511016845703, + "learning_rate": 2.0749546279491834e-05, + "loss": 23.8873, + "step": 1709 + }, + { + "epoch": 6.173363431151241, + "grad_norm": 233.80007934570312, + "learning_rate": 2.0744101633393833e-05, + "loss": 24.0236, + "step": 1710 + }, + { + "epoch": 6.173363431151241, + "eval_loss": 0.6451307535171509, + "eval_runtime": 3.1389, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 1710 + }, + { + "epoch": 6.176975169300226, + "grad_norm": 231.85955810546875, + "learning_rate": 2.0738656987295828e-05, + "loss": 25.2521, + "step": 1711 + }, + { + "epoch": 6.18058690744921, + "grad_norm": 207.05453491210938, + "learning_rate": 2.0733212341197823e-05, + "loss": 25.5774, + "step": 1712 + }, + { + "epoch": 6.184198645598194, + "grad_norm": 265.9180908203125, + "learning_rate": 2.072776769509982e-05, + "loss": 46.0267, + "step": 1713 + }, + { + "epoch": 6.187810383747179, + "grad_norm": 289.2763671875, + "learning_rate": 2.0722323049001814e-05, + "loss": 46.6262, + "step": 1714 + }, + { + "epoch": 6.191422121896163, + "grad_norm": 254.466552734375, + "learning_rate": 2.0716878402903813e-05, + "loss": 44.2758, + "step": 1715 + }, + { + "epoch": 6.195033860045147, + "grad_norm": 262.713134765625, + "learning_rate": 2.071143375680581e-05, + "loss": 44.6334, + "step": 1716 + }, + { + "epoch": 6.198645598194131, + "grad_norm": 272.8150939941406, + "learning_rate": 2.0705989110707804e-05, + "loss": 44.9617, + "step": 1717 + }, + { + "epoch": 6.2022573363431155, + "grad_norm": 288.115478515625, + "learning_rate": 2.07005444646098e-05, + "loss": 44.4382, + "step": 1718 + }, + { + "epoch": 6.2058690744920995, + "grad_norm": 226.08058166503906, + "learning_rate": 2.0695099818511795e-05, + "loss": 44.8551, + "step": 1719 + }, + { + "epoch": 6.209480812641083, + "grad_norm": 219.95835876464844, + "learning_rate": 2.0689655172413797e-05, + "loss": 45.5901, + "step": 1720 + }, + { + "epoch": 6.209480812641083, + "eval_loss": 0.6379314661026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 1720 + }, + { + "epoch": 6.213092550790067, + "grad_norm": 190.3118896484375, + "learning_rate": 2.0684210526315792e-05, + "loss": 44.0675, + "step": 1721 + }, + { + "epoch": 6.216704288939052, + "grad_norm": 177.408935546875, + "learning_rate": 2.0678765880217787e-05, + "loss": 42.6333, + "step": 1722 + }, + { + "epoch": 6.220316027088036, + "grad_norm": 231.3040313720703, + "learning_rate": 2.0673321234119783e-05, + "loss": 41.6771, + "step": 1723 + }, + { + "epoch": 6.22392776523702, + "grad_norm": 226.51663208007812, + "learning_rate": 2.0667876588021778e-05, + "loss": 41.0829, + "step": 1724 + }, + { + "epoch": 6.227539503386004, + "grad_norm": 184.55775451660156, + "learning_rate": 2.0662431941923774e-05, + "loss": 39.2682, + "step": 1725 + }, + { + "epoch": 6.231151241534989, + "grad_norm": 205.0491943359375, + "learning_rate": 2.0656987295825772e-05, + "loss": 40.4101, + "step": 1726 + }, + { + "epoch": 6.234762979683973, + "grad_norm": 201.45838928222656, + "learning_rate": 2.0651542649727768e-05, + "loss": 39.9147, + "step": 1727 + }, + { + "epoch": 6.238374717832957, + "grad_norm": 220.16213989257812, + "learning_rate": 2.0646098003629763e-05, + "loss": 40.7215, + "step": 1728 + }, + { + "epoch": 6.241986455981941, + "grad_norm": 260.9661560058594, + "learning_rate": 2.0640653357531762e-05, + "loss": 40.0256, + "step": 1729 + }, + { + "epoch": 6.245598194130926, + "grad_norm": 314.2476806640625, + "learning_rate": 2.0635208711433757e-05, + "loss": 41.1147, + "step": 1730 + }, + { + "epoch": 6.245598194130926, + "eval_loss": 0.6347935199737549, + "eval_runtime": 3.1446, + "eval_samples_per_second": 56.924, + "eval_steps_per_second": 56.924, + "step": 1730 + }, + { + "epoch": 6.24920993227991, + "grad_norm": 262.24505615234375, + "learning_rate": 2.0629764065335756e-05, + "loss": 41.7255, + "step": 1731 + }, + { + "epoch": 6.252821670428894, + "grad_norm": 212.0876922607422, + "learning_rate": 2.062431941923775e-05, + "loss": 41.2559, + "step": 1732 + }, + { + "epoch": 6.2564334085778786, + "grad_norm": 185.3249969482422, + "learning_rate": 2.0618874773139747e-05, + "loss": 41.1664, + "step": 1733 + }, + { + "epoch": 6.2600451467268625, + "grad_norm": 184.7873077392578, + "learning_rate": 2.0613430127041742e-05, + "loss": 41.3357, + "step": 1734 + }, + { + "epoch": 6.2636568848758465, + "grad_norm": 230.11257934570312, + "learning_rate": 2.0607985480943738e-05, + "loss": 43.0978, + "step": 1735 + }, + { + "epoch": 6.2672686230248305, + "grad_norm": 251.255126953125, + "learning_rate": 2.0602540834845733e-05, + "loss": 42.4169, + "step": 1736 + }, + { + "epoch": 6.270880361173815, + "grad_norm": 230.1149444580078, + "learning_rate": 2.0597096188747732e-05, + "loss": 43.2969, + "step": 1737 + }, + { + "epoch": 6.274492099322799, + "grad_norm": 217.2769012451172, + "learning_rate": 2.059165154264973e-05, + "loss": 42.6037, + "step": 1738 + }, + { + "epoch": 6.278103837471783, + "grad_norm": 189.85533142089844, + "learning_rate": 2.0586206896551726e-05, + "loss": 42.1215, + "step": 1739 + }, + { + "epoch": 6.281715575620767, + "grad_norm": 242.15667724609375, + "learning_rate": 2.058076225045372e-05, + "loss": 42.6337, + "step": 1740 + }, + { + "epoch": 6.281715575620767, + "eval_loss": 0.6310555934906006, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 1740 + }, + { + "epoch": 6.285327313769752, + "grad_norm": 213.7873992919922, + "learning_rate": 2.0575317604355717e-05, + "loss": 40.5315, + "step": 1741 + }, + { + "epoch": 6.288939051918736, + "grad_norm": 243.86492919921875, + "learning_rate": 2.0569872958257715e-05, + "loss": 38.9483, + "step": 1742 + }, + { + "epoch": 6.29255079006772, + "grad_norm": 276.0108642578125, + "learning_rate": 2.056442831215971e-05, + "loss": 35.9627, + "step": 1743 + }, + { + "epoch": 6.296162528216704, + "grad_norm": 252.5875701904297, + "learning_rate": 2.0558983666061706e-05, + "loss": 35.4305, + "step": 1744 + }, + { + "epoch": 6.299774266365689, + "grad_norm": 227.15142822265625, + "learning_rate": 2.05535390199637e-05, + "loss": 35.2385, + "step": 1745 + }, + { + "epoch": 6.303386004514673, + "grad_norm": 259.6727294921875, + "learning_rate": 2.0548094373865697e-05, + "loss": 35.735, + "step": 1746 + }, + { + "epoch": 6.306997742663657, + "grad_norm": 185.07765197753906, + "learning_rate": 2.0542649727767696e-05, + "loss": 36.8835, + "step": 1747 + }, + { + "epoch": 6.310609480812641, + "grad_norm": 207.650146484375, + "learning_rate": 2.0537205081669694e-05, + "loss": 36.346, + "step": 1748 + }, + { + "epoch": 6.314221218961626, + "grad_norm": 223.2378692626953, + "learning_rate": 2.053176043557169e-05, + "loss": 36.1527, + "step": 1749 + }, + { + "epoch": 6.3178329571106095, + "grad_norm": 162.90794372558594, + "learning_rate": 2.0526315789473685e-05, + "loss": 35.7408, + "step": 1750 + }, + { + "epoch": 6.3178329571106095, + "eval_loss": 0.6276403069496155, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 1750 + }, + { + "epoch": 6.3214446952595935, + "grad_norm": 165.8592987060547, + "learning_rate": 2.052087114337568e-05, + "loss": 37.7916, + "step": 1751 + }, + { + "epoch": 6.3250564334085775, + "grad_norm": 179.7499542236328, + "learning_rate": 2.0515426497277676e-05, + "loss": 36.8409, + "step": 1752 + }, + { + "epoch": 6.328668171557562, + "grad_norm": 227.0990753173828, + "learning_rate": 2.0509981851179675e-05, + "loss": 37.1766, + "step": 1753 + }, + { + "epoch": 6.332279909706546, + "grad_norm": 216.3297882080078, + "learning_rate": 2.050453720508167e-05, + "loss": 37.5, + "step": 1754 + }, + { + "epoch": 6.33589164785553, + "grad_norm": 197.88409423828125, + "learning_rate": 2.0499092558983666e-05, + "loss": 38.8293, + "step": 1755 + }, + { + "epoch": 6.339503386004514, + "grad_norm": 189.74916076660156, + "learning_rate": 2.049364791288566e-05, + "loss": 37.9873, + "step": 1756 + }, + { + "epoch": 6.343115124153499, + "grad_norm": 241.16644287109375, + "learning_rate": 2.048820326678766e-05, + "loss": 39.3107, + "step": 1757 + }, + { + "epoch": 6.346726862302483, + "grad_norm": 224.3491668701172, + "learning_rate": 2.0482758620689655e-05, + "loss": 36.2482, + "step": 1758 + }, + { + "epoch": 6.350338600451467, + "grad_norm": 217.30882263183594, + "learning_rate": 2.0477313974591654e-05, + "loss": 24.1945, + "step": 1759 + }, + { + "epoch": 6.353950338600452, + "grad_norm": 213.23683166503906, + "learning_rate": 2.047186932849365e-05, + "loss": 24.2356, + "step": 1760 + }, + { + "epoch": 6.353950338600452, + "eval_loss": 0.6382855772972107, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.795, + "eval_steps_per_second": 56.795, + "step": 1760 + }, + { + "epoch": 6.357562076749436, + "grad_norm": 209.8166961669922, + "learning_rate": 2.0466424682395645e-05, + "loss": 25.1916, + "step": 1761 + }, + { + "epoch": 6.36117381489842, + "grad_norm": 197.86773681640625, + "learning_rate": 2.046098003629764e-05, + "loss": 25.1372, + "step": 1762 + }, + { + "epoch": 6.364785553047404, + "grad_norm": 280.80517578125, + "learning_rate": 2.0455535390199635e-05, + "loss": 45.0431, + "step": 1763 + }, + { + "epoch": 6.368397291196389, + "grad_norm": 239.85861206054688, + "learning_rate": 2.0450090744101634e-05, + "loss": 45.4893, + "step": 1764 + }, + { + "epoch": 6.372009029345373, + "grad_norm": 302.56024169921875, + "learning_rate": 2.044464609800363e-05, + "loss": 45.3313, + "step": 1765 + }, + { + "epoch": 6.375620767494357, + "grad_norm": 255.5519256591797, + "learning_rate": 2.043920145190563e-05, + "loss": 44.703, + "step": 1766 + }, + { + "epoch": 6.3792325056433405, + "grad_norm": 223.1331024169922, + "learning_rate": 2.0433756805807624e-05, + "loss": 45.0278, + "step": 1767 + }, + { + "epoch": 6.382844243792325, + "grad_norm": 240.68817138671875, + "learning_rate": 2.042831215970962e-05, + "loss": 44.7298, + "step": 1768 + }, + { + "epoch": 6.386455981941309, + "grad_norm": 239.5072021484375, + "learning_rate": 2.0422867513611614e-05, + "loss": 44.0512, + "step": 1769 + }, + { + "epoch": 6.390067720090293, + "grad_norm": 186.3783416748047, + "learning_rate": 2.0417422867513613e-05, + "loss": 43.8646, + "step": 1770 + }, + { + "epoch": 6.390067720090293, + "eval_loss": 0.6325972676277161, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 1770 + }, + { + "epoch": 6.393679458239277, + "grad_norm": 169.77285766601562, + "learning_rate": 2.041197822141561e-05, + "loss": 43.8688, + "step": 1771 + }, + { + "epoch": 6.397291196388262, + "grad_norm": 158.4019012451172, + "learning_rate": 2.0406533575317604e-05, + "loss": 42.5757, + "step": 1772 + }, + { + "epoch": 6.400902934537246, + "grad_norm": 209.79916381835938, + "learning_rate": 2.04010889292196e-05, + "loss": 44.8075, + "step": 1773 + }, + { + "epoch": 6.40451467268623, + "grad_norm": 215.74639892578125, + "learning_rate": 2.0395644283121595e-05, + "loss": 42.0121, + "step": 1774 + }, + { + "epoch": 6.408126410835214, + "grad_norm": 215.21121215820312, + "learning_rate": 2.0390199637023597e-05, + "loss": 40.6564, + "step": 1775 + }, + { + "epoch": 6.411738148984199, + "grad_norm": 244.49574279785156, + "learning_rate": 2.0384754990925592e-05, + "loss": 40.543, + "step": 1776 + }, + { + "epoch": 6.415349887133183, + "grad_norm": 189.22781372070312, + "learning_rate": 2.0379310344827588e-05, + "loss": 39.5569, + "step": 1777 + }, + { + "epoch": 6.418961625282167, + "grad_norm": 204.32664489746094, + "learning_rate": 2.0373865698729583e-05, + "loss": 40.0789, + "step": 1778 + }, + { + "epoch": 6.422573363431152, + "grad_norm": 217.5277557373047, + "learning_rate": 2.036842105263158e-05, + "loss": 39.6436, + "step": 1779 + }, + { + "epoch": 6.426185101580136, + "grad_norm": 196.25918579101562, + "learning_rate": 2.0362976406533574e-05, + "loss": 41.0794, + "step": 1780 + }, + { + "epoch": 6.426185101580136, + "eval_loss": 0.6334295868873596, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.878, + "eval_steps_per_second": 56.878, + "step": 1780 + }, + { + "epoch": 6.42979683972912, + "grad_norm": 191.50656127929688, + "learning_rate": 2.0357531760435573e-05, + "loss": 41.2976, + "step": 1781 + }, + { + "epoch": 6.433408577878104, + "grad_norm": 192.98692321777344, + "learning_rate": 2.0352087114337568e-05, + "loss": 41.0843, + "step": 1782 + }, + { + "epoch": 6.437020316027088, + "grad_norm": 197.32862854003906, + "learning_rate": 2.0346642468239563e-05, + "loss": 40.4123, + "step": 1783 + }, + { + "epoch": 6.440632054176072, + "grad_norm": 205.18751525878906, + "learning_rate": 2.0341197822141562e-05, + "loss": 41.9185, + "step": 1784 + }, + { + "epoch": 6.444243792325056, + "grad_norm": 201.69070434570312, + "learning_rate": 2.0335753176043558e-05, + "loss": 41.6794, + "step": 1785 + }, + { + "epoch": 6.44785553047404, + "grad_norm": 218.77044677734375, + "learning_rate": 2.0330308529945556e-05, + "loss": 43.5805, + "step": 1786 + }, + { + "epoch": 6.451467268623025, + "grad_norm": 183.25967407226562, + "learning_rate": 2.0324863883847552e-05, + "loss": 41.2777, + "step": 1787 + }, + { + "epoch": 6.455079006772009, + "grad_norm": 219.97369384765625, + "learning_rate": 2.0319419237749547e-05, + "loss": 42.4618, + "step": 1788 + }, + { + "epoch": 6.458690744920993, + "grad_norm": 216.1624298095703, + "learning_rate": 2.0313974591651542e-05, + "loss": 41.6424, + "step": 1789 + }, + { + "epoch": 6.462302483069977, + "grad_norm": 222.29965209960938, + "learning_rate": 2.0308529945553538e-05, + "loss": 41.4058, + "step": 1790 + }, + { + "epoch": 6.462302483069977, + "eval_loss": 0.6282982230186462, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 1790 + }, + { + "epoch": 6.465914221218962, + "grad_norm": 215.50511169433594, + "learning_rate": 2.0303085299455533e-05, + "loss": 39.474, + "step": 1791 + }, + { + "epoch": 6.469525959367946, + "grad_norm": 237.2119903564453, + "learning_rate": 2.0297640653357532e-05, + "loss": 36.0508, + "step": 1792 + }, + { + "epoch": 6.47313769751693, + "grad_norm": 234.52975463867188, + "learning_rate": 2.029219600725953e-05, + "loss": 34.1704, + "step": 1793 + }, + { + "epoch": 6.476749435665914, + "grad_norm": 213.22216796875, + "learning_rate": 2.0286751361161526e-05, + "loss": 34.7592, + "step": 1794 + }, + { + "epoch": 6.480361173814899, + "grad_norm": 215.77244567871094, + "learning_rate": 2.028130671506352e-05, + "loss": 35.3051, + "step": 1795 + }, + { + "epoch": 6.483972911963883, + "grad_norm": 179.0439910888672, + "learning_rate": 2.0275862068965517e-05, + "loss": 35.2493, + "step": 1796 + }, + { + "epoch": 6.487584650112867, + "grad_norm": 217.47218322753906, + "learning_rate": 2.0270417422867516e-05, + "loss": 35.6169, + "step": 1797 + }, + { + "epoch": 6.491196388261851, + "grad_norm": 191.3380584716797, + "learning_rate": 2.026497277676951e-05, + "loss": 36.428, + "step": 1798 + }, + { + "epoch": 6.4948081264108355, + "grad_norm": 200.8570098876953, + "learning_rate": 2.0259528130671506e-05, + "loss": 36.5983, + "step": 1799 + }, + { + "epoch": 6.4984198645598195, + "grad_norm": 173.1240234375, + "learning_rate": 2.0254083484573502e-05, + "loss": 36.0163, + "step": 1800 + }, + { + "epoch": 6.4984198645598195, + "eval_loss": 0.6268841624259949, + "eval_runtime": 3.146, + "eval_samples_per_second": 56.898, + "eval_steps_per_second": 56.898, + "step": 1800 + }, + { + "epoch": 6.502031602708803, + "grad_norm": 225.66845703125, + "learning_rate": 2.0248638838475497e-05, + "loss": 36.2461, + "step": 1801 + }, + { + "epoch": 6.505643340857787, + "grad_norm": 189.66233825683594, + "learning_rate": 2.0243194192377496e-05, + "loss": 37.416, + "step": 1802 + }, + { + "epoch": 6.509255079006772, + "grad_norm": 243.0270233154297, + "learning_rate": 2.0237749546279495e-05, + "loss": 38.5309, + "step": 1803 + }, + { + "epoch": 6.512866817155756, + "grad_norm": 192.0927276611328, + "learning_rate": 2.023230490018149e-05, + "loss": 37.087, + "step": 1804 + }, + { + "epoch": 6.51647855530474, + "grad_norm": 222.2957305908203, + "learning_rate": 2.0226860254083486e-05, + "loss": 37.8877, + "step": 1805 + }, + { + "epoch": 6.520090293453725, + "grad_norm": 259.84722900390625, + "learning_rate": 2.022141560798548e-05, + "loss": 39.2138, + "step": 1806 + }, + { + "epoch": 6.523702031602709, + "grad_norm": 205.5794219970703, + "learning_rate": 2.0215970961887476e-05, + "loss": 38.6066, + "step": 1807 + }, + { + "epoch": 6.527313769751693, + "grad_norm": 300.455810546875, + "learning_rate": 2.0210526315789475e-05, + "loss": 36.1581, + "step": 1808 + }, + { + "epoch": 6.530925507900677, + "grad_norm": 207.18063354492188, + "learning_rate": 2.020508166969147e-05, + "loss": 24.3689, + "step": 1809 + }, + { + "epoch": 6.534537246049661, + "grad_norm": 230.98516845703125, + "learning_rate": 2.0199637023593466e-05, + "loss": 23.7019, + "step": 1810 + }, + { + "epoch": 6.534537246049661, + "eval_loss": 0.6379140615463257, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 1810 + }, + { + "epoch": 6.538148984198646, + "grad_norm": 153.8694610595703, + "learning_rate": 2.019419237749546e-05, + "loss": 24.5035, + "step": 1811 + }, + { + "epoch": 6.54176072234763, + "grad_norm": 229.9432373046875, + "learning_rate": 2.018874773139746e-05, + "loss": 26.1645, + "step": 1812 + }, + { + "epoch": 6.545372460496614, + "grad_norm": 325.3592529296875, + "learning_rate": 2.018330308529946e-05, + "loss": 45.6349, + "step": 1813 + }, + { + "epoch": 6.5489841986455986, + "grad_norm": 261.0744323730469, + "learning_rate": 2.0177858439201454e-05, + "loss": 45.5545, + "step": 1814 + }, + { + "epoch": 6.5525959367945825, + "grad_norm": 261.4237976074219, + "learning_rate": 2.017241379310345e-05, + "loss": 45.321, + "step": 1815 + }, + { + "epoch": 6.5562076749435665, + "grad_norm": 238.8377685546875, + "learning_rate": 2.0166969147005445e-05, + "loss": 44.5963, + "step": 1816 + }, + { + "epoch": 6.5598194130925505, + "grad_norm": 225.89730834960938, + "learning_rate": 2.016152450090744e-05, + "loss": 43.593, + "step": 1817 + }, + { + "epoch": 6.563431151241535, + "grad_norm": 265.09625244140625, + "learning_rate": 2.0156079854809436e-05, + "loss": 43.536, + "step": 1818 + }, + { + "epoch": 6.567042889390519, + "grad_norm": 257.9114685058594, + "learning_rate": 2.0150635208711434e-05, + "loss": 44.1125, + "step": 1819 + }, + { + "epoch": 6.570654627539503, + "grad_norm": 188.06382751464844, + "learning_rate": 2.014519056261343e-05, + "loss": 45.097, + "step": 1820 + }, + { + "epoch": 6.570654627539503, + "eval_loss": 0.6347097754478455, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 1820 + }, + { + "epoch": 6.574266365688487, + "grad_norm": 227.7350616455078, + "learning_rate": 2.013974591651543e-05, + "loss": 43.9367, + "step": 1821 + }, + { + "epoch": 6.577878103837472, + "grad_norm": 207.54774475097656, + "learning_rate": 2.0134301270417424e-05, + "loss": 43.8266, + "step": 1822 + }, + { + "epoch": 6.581489841986456, + "grad_norm": 204.62364196777344, + "learning_rate": 2.012885662431942e-05, + "loss": 42.7973, + "step": 1823 + }, + { + "epoch": 6.58510158013544, + "grad_norm": 244.32159423828125, + "learning_rate": 2.0123411978221418e-05, + "loss": 42.7741, + "step": 1824 + }, + { + "epoch": 6.588713318284425, + "grad_norm": 304.9100036621094, + "learning_rate": 2.0117967332123414e-05, + "loss": 40.6529, + "step": 1825 + }, + { + "epoch": 6.592325056433409, + "grad_norm": 275.5767517089844, + "learning_rate": 2.011252268602541e-05, + "loss": 40.2909, + "step": 1826 + }, + { + "epoch": 6.595936794582393, + "grad_norm": 227.69642639160156, + "learning_rate": 2.0107078039927404e-05, + "loss": 39.8786, + "step": 1827 + }, + { + "epoch": 6.599548532731377, + "grad_norm": 261.4333190917969, + "learning_rate": 2.01016333938294e-05, + "loss": 40.7009, + "step": 1828 + }, + { + "epoch": 6.603160270880361, + "grad_norm": 213.0095977783203, + "learning_rate": 2.0096188747731395e-05, + "loss": 40.0595, + "step": 1829 + }, + { + "epoch": 6.606772009029346, + "grad_norm": 251.78590393066406, + "learning_rate": 2.0090744101633397e-05, + "loss": 40.8939, + "step": 1830 + }, + { + "epoch": 6.606772009029346, + "eval_loss": 0.6333281397819519, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1830 + }, + { + "epoch": 6.6103837471783295, + "grad_norm": 224.89805603027344, + "learning_rate": 2.0085299455535393e-05, + "loss": 41.4123, + "step": 1831 + }, + { + "epoch": 6.6139954853273135, + "grad_norm": 195.67982482910156, + "learning_rate": 2.0079854809437388e-05, + "loss": 41.3483, + "step": 1832 + }, + { + "epoch": 6.617607223476298, + "grad_norm": 214.318603515625, + "learning_rate": 2.0074410163339383e-05, + "loss": 40.5516, + "step": 1833 + }, + { + "epoch": 6.621218961625282, + "grad_norm": 226.60968017578125, + "learning_rate": 2.006896551724138e-05, + "loss": 41.3523, + "step": 1834 + }, + { + "epoch": 6.624830699774266, + "grad_norm": 231.63604736328125, + "learning_rate": 2.0063520871143378e-05, + "loss": 41.8734, + "step": 1835 + }, + { + "epoch": 6.62844243792325, + "grad_norm": 224.1644287109375, + "learning_rate": 2.0058076225045373e-05, + "loss": 42.7386, + "step": 1836 + }, + { + "epoch": 6.632054176072235, + "grad_norm": 273.651123046875, + "learning_rate": 2.0052631578947368e-05, + "loss": 42.4525, + "step": 1837 + }, + { + "epoch": 6.635665914221219, + "grad_norm": 270.8088684082031, + "learning_rate": 2.0047186932849364e-05, + "loss": 42.1051, + "step": 1838 + }, + { + "epoch": 6.639277652370203, + "grad_norm": 303.1058044433594, + "learning_rate": 2.0041742286751362e-05, + "loss": 42.1301, + "step": 1839 + }, + { + "epoch": 6.642889390519187, + "grad_norm": 207.29380798339844, + "learning_rate": 2.0036297640653358e-05, + "loss": 42.1495, + "step": 1840 + }, + { + "epoch": 6.642889390519187, + "eval_loss": 0.6321585774421692, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1840 + }, + { + "epoch": 6.646501128668172, + "grad_norm": 262.1852722167969, + "learning_rate": 2.0030852994555357e-05, + "loss": 39.6408, + "step": 1841 + }, + { + "epoch": 6.650112866817156, + "grad_norm": 233.7991943359375, + "learning_rate": 2.0025408348457352e-05, + "loss": 37.6177, + "step": 1842 + }, + { + "epoch": 6.65372460496614, + "grad_norm": 247.25514221191406, + "learning_rate": 2.0019963702359347e-05, + "loss": 35.4287, + "step": 1843 + }, + { + "epoch": 6.657336343115124, + "grad_norm": 191.53343200683594, + "learning_rate": 2.0014519056261343e-05, + "loss": 34.2335, + "step": 1844 + }, + { + "epoch": 6.660948081264109, + "grad_norm": 245.22821044921875, + "learning_rate": 2.0009074410163338e-05, + "loss": 35.8097, + "step": 1845 + }, + { + "epoch": 6.664559819413093, + "grad_norm": 213.8151092529297, + "learning_rate": 2.0003629764065337e-05, + "loss": 35.2621, + "step": 1846 + }, + { + "epoch": 6.668171557562077, + "grad_norm": 174.6085205078125, + "learning_rate": 1.9998185117967332e-05, + "loss": 36.6137, + "step": 1847 + }, + { + "epoch": 6.6717832957110605, + "grad_norm": 287.4677429199219, + "learning_rate": 1.9992740471869328e-05, + "loss": 37.5896, + "step": 1848 + }, + { + "epoch": 6.675395033860045, + "grad_norm": 224.59771728515625, + "learning_rate": 1.9987295825771326e-05, + "loss": 36.5515, + "step": 1849 + }, + { + "epoch": 6.679006772009029, + "grad_norm": 212.73065185546875, + "learning_rate": 1.9981851179673322e-05, + "loss": 36.2511, + "step": 1850 + }, + { + "epoch": 6.679006772009029, + "eval_loss": 0.6308404803276062, + "eval_runtime": 3.1419, + "eval_samples_per_second": 56.972, + "eval_steps_per_second": 56.972, + "step": 1850 + }, + { + "epoch": 6.682618510158013, + "grad_norm": 214.7340850830078, + "learning_rate": 1.9976406533575317e-05, + "loss": 37.6949, + "step": 1851 + }, + { + "epoch": 6.686230248306998, + "grad_norm": 220.3029327392578, + "learning_rate": 1.9970961887477316e-05, + "loss": 36.5785, + "step": 1852 + }, + { + "epoch": 6.689841986455982, + "grad_norm": 198.97564697265625, + "learning_rate": 1.996551724137931e-05, + "loss": 38.5277, + "step": 1853 + }, + { + "epoch": 6.693453724604966, + "grad_norm": 180.94789123535156, + "learning_rate": 1.9960072595281307e-05, + "loss": 37.5197, + "step": 1854 + }, + { + "epoch": 6.69706546275395, + "grad_norm": 212.17584228515625, + "learning_rate": 1.9954627949183302e-05, + "loss": 37.3483, + "step": 1855 + }, + { + "epoch": 6.700677200902934, + "grad_norm": 253.88601684570312, + "learning_rate": 1.9949183303085298e-05, + "loss": 38.5224, + "step": 1856 + }, + { + "epoch": 6.704288939051919, + "grad_norm": 193.17698669433594, + "learning_rate": 1.9943738656987296e-05, + "loss": 37.5679, + "step": 1857 + }, + { + "epoch": 6.707900677200903, + "grad_norm": 217.2652130126953, + "learning_rate": 1.9938294010889295e-05, + "loss": 27.7344, + "step": 1858 + }, + { + "epoch": 6.711512415349887, + "grad_norm": 183.9295196533203, + "learning_rate": 1.993284936479129e-05, + "loss": 24.3864, + "step": 1859 + }, + { + "epoch": 6.715124153498872, + "grad_norm": 200.3455352783203, + "learning_rate": 1.9927404718693286e-05, + "loss": 23.7328, + "step": 1860 + }, + { + "epoch": 6.715124153498872, + "eval_loss": 0.636415421962738, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.943, + "eval_steps_per_second": 56.943, + "step": 1860 + }, + { + "epoch": 6.718735891647856, + "grad_norm": 206.7858123779297, + "learning_rate": 1.992196007259528e-05, + "loss": 24.6541, + "step": 1861 + }, + { + "epoch": 6.72234762979684, + "grad_norm": 208.10414123535156, + "learning_rate": 1.9916515426497277e-05, + "loss": 25.1223, + "step": 1862 + }, + { + "epoch": 6.725959367945824, + "grad_norm": 270.6657409667969, + "learning_rate": 1.9911070780399275e-05, + "loss": 44.8561, + "step": 1863 + }, + { + "epoch": 6.7295711060948085, + "grad_norm": 246.69094848632812, + "learning_rate": 1.990562613430127e-05, + "loss": 45.8683, + "step": 1864 + }, + { + "epoch": 6.733182844243792, + "grad_norm": 243.4462432861328, + "learning_rate": 1.9900181488203266e-05, + "loss": 45.1845, + "step": 1865 + }, + { + "epoch": 6.736794582392776, + "grad_norm": 218.0637969970703, + "learning_rate": 1.989473684210526e-05, + "loss": 43.9492, + "step": 1866 + }, + { + "epoch": 6.74040632054176, + "grad_norm": 200.28140258789062, + "learning_rate": 1.988929219600726e-05, + "loss": 44.0612, + "step": 1867 + }, + { + "epoch": 6.744018058690745, + "grad_norm": 200.3120880126953, + "learning_rate": 1.988384754990926e-05, + "loss": 43.4748, + "step": 1868 + }, + { + "epoch": 6.747629796839729, + "grad_norm": 186.1811065673828, + "learning_rate": 1.9878402903811254e-05, + "loss": 43.6851, + "step": 1869 + }, + { + "epoch": 6.751241534988713, + "grad_norm": 208.15167236328125, + "learning_rate": 1.987295825771325e-05, + "loss": 44.4196, + "step": 1870 + }, + { + "epoch": 6.751241534988713, + "eval_loss": 0.6353851556777954, + "eval_runtime": 3.1436, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 1870 + }, + { + "epoch": 6.754853273137698, + "grad_norm": 207.500244140625, + "learning_rate": 1.9867513611615245e-05, + "loss": 44.1493, + "step": 1871 + }, + { + "epoch": 6.758465011286682, + "grad_norm": 238.17047119140625, + "learning_rate": 1.986206896551724e-05, + "loss": 44.6587, + "step": 1872 + }, + { + "epoch": 6.762076749435666, + "grad_norm": 192.9468231201172, + "learning_rate": 1.9856624319419236e-05, + "loss": 43.2409, + "step": 1873 + }, + { + "epoch": 6.76568848758465, + "grad_norm": 205.26492309570312, + "learning_rate": 1.9851179673321235e-05, + "loss": 40.8636, + "step": 1874 + }, + { + "epoch": 6.769300225733634, + "grad_norm": 190.49908447265625, + "learning_rate": 1.984573502722323e-05, + "loss": 41.0769, + "step": 1875 + }, + { + "epoch": 6.772911963882619, + "grad_norm": 206.56097412109375, + "learning_rate": 1.984029038112523e-05, + "loss": 40.1137, + "step": 1876 + }, + { + "epoch": 6.776523702031603, + "grad_norm": 212.89256286621094, + "learning_rate": 1.9834845735027224e-05, + "loss": 41.0114, + "step": 1877 + }, + { + "epoch": 6.780135440180587, + "grad_norm": 197.24267578125, + "learning_rate": 1.982940108892922e-05, + "loss": 40.6027, + "step": 1878 + }, + { + "epoch": 6.7837471783295715, + "grad_norm": 187.01942443847656, + "learning_rate": 1.982395644283122e-05, + "loss": 40.5933, + "step": 1879 + }, + { + "epoch": 6.7873589164785555, + "grad_norm": 236.31092834472656, + "learning_rate": 1.9818511796733214e-05, + "loss": 41.2282, + "step": 1880 + }, + { + "epoch": 6.7873589164785555, + "eval_loss": 0.6299392580986023, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 1880 + }, + { + "epoch": 6.7909706546275395, + "grad_norm": 194.92059326171875, + "learning_rate": 1.981306715063521e-05, + "loss": 41.5858, + "step": 1881 + }, + { + "epoch": 6.794582392776523, + "grad_norm": 192.26272583007812, + "learning_rate": 1.9807622504537205e-05, + "loss": 40.6826, + "step": 1882 + }, + { + "epoch": 6.798194130925508, + "grad_norm": 181.8116912841797, + "learning_rate": 1.98021778584392e-05, + "loss": 40.0867, + "step": 1883 + }, + { + "epoch": 6.801805869074492, + "grad_norm": 219.03494262695312, + "learning_rate": 1.9796733212341195e-05, + "loss": 41.4496, + "step": 1884 + }, + { + "epoch": 6.805417607223476, + "grad_norm": 190.7852325439453, + "learning_rate": 1.9791288566243194e-05, + "loss": 42.4147, + "step": 1885 + }, + { + "epoch": 6.80902934537246, + "grad_norm": 200.32476806640625, + "learning_rate": 1.9785843920145193e-05, + "loss": 42.0316, + "step": 1886 + }, + { + "epoch": 6.812641083521445, + "grad_norm": 240.6086883544922, + "learning_rate": 1.9780399274047188e-05, + "loss": 39.6992, + "step": 1887 + }, + { + "epoch": 6.816252821670429, + "grad_norm": 222.31700134277344, + "learning_rate": 1.9774954627949184e-05, + "loss": 42.9572, + "step": 1888 + }, + { + "epoch": 6.819864559819413, + "grad_norm": 215.65292358398438, + "learning_rate": 1.976950998185118e-05, + "loss": 42.5147, + "step": 1889 + }, + { + "epoch": 6.823476297968397, + "grad_norm": 195.71624755859375, + "learning_rate": 1.9764065335753178e-05, + "loss": 40.9536, + "step": 1890 + }, + { + "epoch": 6.823476297968397, + "eval_loss": 0.6288287043571472, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 1890 + }, + { + "epoch": 6.827088036117382, + "grad_norm": 202.301025390625, + "learning_rate": 1.9758620689655173e-05, + "loss": 40.1754, + "step": 1891 + }, + { + "epoch": 6.830699774266366, + "grad_norm": 217.07186889648438, + "learning_rate": 1.975317604355717e-05, + "loss": 35.7505, + "step": 1892 + }, + { + "epoch": 6.83431151241535, + "grad_norm": 189.78782653808594, + "learning_rate": 1.9747731397459164e-05, + "loss": 34.813, + "step": 1893 + }, + { + "epoch": 6.837923250564334, + "grad_norm": 247.2117462158203, + "learning_rate": 1.974228675136116e-05, + "loss": 33.932, + "step": 1894 + }, + { + "epoch": 6.8415349887133186, + "grad_norm": 244.06321716308594, + "learning_rate": 1.9736842105263158e-05, + "loss": 36.2514, + "step": 1895 + }, + { + "epoch": 6.8451467268623025, + "grad_norm": 235.78692626953125, + "learning_rate": 1.9731397459165157e-05, + "loss": 35.2123, + "step": 1896 + }, + { + "epoch": 6.8487584650112865, + "grad_norm": 193.82456970214844, + "learning_rate": 1.9725952813067152e-05, + "loss": 36.5477, + "step": 1897 + }, + { + "epoch": 6.852370203160271, + "grad_norm": 230.2017059326172, + "learning_rate": 1.9720508166969148e-05, + "loss": 36.1244, + "step": 1898 + }, + { + "epoch": 6.855981941309255, + "grad_norm": 205.5274200439453, + "learning_rate": 1.9715063520871143e-05, + "loss": 36.7059, + "step": 1899 + }, + { + "epoch": 6.859593679458239, + "grad_norm": 236.6873016357422, + "learning_rate": 1.970961887477314e-05, + "loss": 36.6212, + "step": 1900 + }, + { + "epoch": 6.859593679458239, + "eval_loss": 0.6235609650611877, + "eval_runtime": 3.1497, + "eval_samples_per_second": 56.831, + "eval_steps_per_second": 56.831, + "step": 1900 + }, + { + "epoch": 6.863205417607223, + "grad_norm": 217.63638305664062, + "learning_rate": 1.9704174228675137e-05, + "loss": 37.3918, + "step": 1901 + }, + { + "epoch": 6.866817155756207, + "grad_norm": 169.31996154785156, + "learning_rate": 1.9698729582577133e-05, + "loss": 37.8555, + "step": 1902 + }, + { + "epoch": 6.870428893905192, + "grad_norm": 204.2144775390625, + "learning_rate": 1.9693284936479128e-05, + "loss": 38.0013, + "step": 1903 + }, + { + "epoch": 6.874040632054176, + "grad_norm": 219.13595581054688, + "learning_rate": 1.9687840290381127e-05, + "loss": 37.2128, + "step": 1904 + }, + { + "epoch": 6.87765237020316, + "grad_norm": 189.8477325439453, + "learning_rate": 1.9682395644283122e-05, + "loss": 39.272, + "step": 1905 + }, + { + "epoch": 6.881264108352145, + "grad_norm": 214.21360778808594, + "learning_rate": 1.967695099818512e-05, + "loss": 37.5185, + "step": 1906 + }, + { + "epoch": 6.884875846501129, + "grad_norm": 252.57867431640625, + "learning_rate": 1.9671506352087116e-05, + "loss": 37.6195, + "step": 1907 + }, + { + "epoch": 6.888487584650113, + "grad_norm": 169.85382080078125, + "learning_rate": 1.966606170598911e-05, + "loss": 29.083, + "step": 1908 + }, + { + "epoch": 6.892099322799097, + "grad_norm": 161.38137817382812, + "learning_rate": 1.9660617059891107e-05, + "loss": 24.4547, + "step": 1909 + }, + { + "epoch": 6.895711060948082, + "grad_norm": 192.5706787109375, + "learning_rate": 1.9655172413793102e-05, + "loss": 24.2235, + "step": 1910 + }, + { + "epoch": 6.895711060948082, + "eval_loss": 0.6387229561805725, + "eval_runtime": 3.1483, + "eval_samples_per_second": 56.856, + "eval_steps_per_second": 56.856, + "step": 1910 + }, + { + "epoch": 6.899322799097066, + "grad_norm": 177.5368194580078, + "learning_rate": 1.9649727767695098e-05, + "loss": 24.8032, + "step": 1911 + }, + { + "epoch": 6.9029345372460496, + "grad_norm": 206.98458862304688, + "learning_rate": 1.9644283121597097e-05, + "loss": 25.7293, + "step": 1912 + }, + { + "epoch": 6.9065462753950335, + "grad_norm": 238.7289581298828, + "learning_rate": 1.9638838475499095e-05, + "loss": 44.2514, + "step": 1913 + }, + { + "epoch": 6.910158013544018, + "grad_norm": 225.86854553222656, + "learning_rate": 1.963339382940109e-05, + "loss": 44.4858, + "step": 1914 + }, + { + "epoch": 6.913769751693002, + "grad_norm": 235.71524047851562, + "learning_rate": 1.9627949183303086e-05, + "loss": 44.5351, + "step": 1915 + }, + { + "epoch": 6.917381489841986, + "grad_norm": 233.1634063720703, + "learning_rate": 1.962250453720508e-05, + "loss": 44.0865, + "step": 1916 + }, + { + "epoch": 6.92099322799097, + "grad_norm": 201.48944091796875, + "learning_rate": 1.961705989110708e-05, + "loss": 45.0226, + "step": 1917 + }, + { + "epoch": 6.924604966139955, + "grad_norm": 226.95469665527344, + "learning_rate": 1.9611615245009076e-05, + "loss": 44.3969, + "step": 1918 + }, + { + "epoch": 6.928216704288939, + "grad_norm": 242.79940795898438, + "learning_rate": 1.960617059891107e-05, + "loss": 41.3037, + "step": 1919 + }, + { + "epoch": 6.931828442437923, + "grad_norm": 255.3524932861328, + "learning_rate": 1.9600725952813066e-05, + "loss": 41.3567, + "step": 1920 + }, + { + "epoch": 6.931828442437923, + "eval_loss": 0.6346065998077393, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 1920 + }, + { + "epoch": 6.935440180586907, + "grad_norm": 277.0763854980469, + "learning_rate": 1.9595281306715062e-05, + "loss": 41.142, + "step": 1921 + }, + { + "epoch": 6.939051918735892, + "grad_norm": 176.02658081054688, + "learning_rate": 1.958983666061706e-05, + "loss": 42.1963, + "step": 1922 + }, + { + "epoch": 6.942663656884876, + "grad_norm": 236.36398315429688, + "learning_rate": 1.958439201451906e-05, + "loss": 42.351, + "step": 1923 + }, + { + "epoch": 6.94627539503386, + "grad_norm": 203.0919647216797, + "learning_rate": 1.9578947368421055e-05, + "loss": 41.5248, + "step": 1924 + }, + { + "epoch": 6.949887133182845, + "grad_norm": 273.605712890625, + "learning_rate": 1.957350272232305e-05, + "loss": 42.1004, + "step": 1925 + }, + { + "epoch": 6.953498871331829, + "grad_norm": 214.04319763183594, + "learning_rate": 1.9568058076225045e-05, + "loss": 42.6326, + "step": 1926 + }, + { + "epoch": 6.957110609480813, + "grad_norm": 250.81832885742188, + "learning_rate": 1.956261343012704e-05, + "loss": 43.8045, + "step": 1927 + }, + { + "epoch": 6.960722347629797, + "grad_norm": 233.58116149902344, + "learning_rate": 1.955716878402904e-05, + "loss": 39.8991, + "step": 1928 + }, + { + "epoch": 6.9643340857787805, + "grad_norm": 269.0545654296875, + "learning_rate": 1.9551724137931035e-05, + "loss": 34.6192, + "step": 1929 + }, + { + "epoch": 6.967945823927765, + "grad_norm": 266.1218566894531, + "learning_rate": 1.954627949183303e-05, + "loss": 35.7568, + "step": 1930 + }, + { + "epoch": 6.967945823927765, + "eval_loss": 0.6233173608779907, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.975, + "eval_steps_per_second": 56.975, + "step": 1930 + }, + { + "epoch": 6.971557562076749, + "grad_norm": 294.6914978027344, + "learning_rate": 1.9540834845735026e-05, + "loss": 36.0795, + "step": 1931 + }, + { + "epoch": 6.975169300225733, + "grad_norm": 373.6831970214844, + "learning_rate": 1.9535390199637025e-05, + "loss": 37.2715, + "step": 1932 + }, + { + "epoch": 6.978781038374718, + "grad_norm": 240.34738159179688, + "learning_rate": 1.952994555353902e-05, + "loss": 37.8335, + "step": 1933 + }, + { + "epoch": 6.982392776523702, + "grad_norm": 312.1968994140625, + "learning_rate": 1.952450090744102e-05, + "loss": 37.8251, + "step": 1934 + }, + { + "epoch": 6.986004514672686, + "grad_norm": 276.3544006347656, + "learning_rate": 1.9519056261343014e-05, + "loss": 38.8466, + "step": 1935 + }, + { + "epoch": 6.98961625282167, + "grad_norm": 282.6874694824219, + "learning_rate": 1.951361161524501e-05, + "loss": 37.774, + "step": 1936 + }, + { + "epoch": 6.993227990970655, + "grad_norm": 323.96612548828125, + "learning_rate": 1.9508166969147005e-05, + "loss": 34.3747, + "step": 1937 + }, + { + "epoch": 6.996839729119639, + "grad_norm": 235.02915954589844, + "learning_rate": 1.9502722323049e-05, + "loss": 24.5297, + "step": 1938 + }, + { + "epoch": 7.0, + "grad_norm": 176.4046173095703, + "learning_rate": 1.9497277676951e-05, + "loss": 22.3179, + "step": 1939 + }, + { + "epoch": 7.003611738148984, + "grad_norm": 248.2797393798828, + "learning_rate": 1.9491833030852994e-05, + "loss": 42.225, + "step": 1940 + }, + { + "epoch": 7.003611738148984, + "eval_loss": 0.6272363066673279, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.911, + "eval_steps_per_second": 56.911, + "step": 1940 + }, + { + "epoch": 7.007223476297969, + "grad_norm": 235.9131622314453, + "learning_rate": 1.9486388384754993e-05, + "loss": 43.6526, + "step": 1941 + }, + { + "epoch": 7.010835214446953, + "grad_norm": 223.63479614257812, + "learning_rate": 1.948094373865699e-05, + "loss": 42.9052, + "step": 1942 + }, + { + "epoch": 7.014446952595937, + "grad_norm": 203.92141723632812, + "learning_rate": 1.9475499092558984e-05, + "loss": 43.5819, + "step": 1943 + }, + { + "epoch": 7.018058690744921, + "grad_norm": 209.6050567626953, + "learning_rate": 1.947005444646098e-05, + "loss": 43.1077, + "step": 1944 + }, + { + "epoch": 7.021670428893906, + "grad_norm": 245.77700805664062, + "learning_rate": 1.9464609800362978e-05, + "loss": 42.7508, + "step": 1945 + }, + { + "epoch": 7.0252821670428895, + "grad_norm": 203.13465881347656, + "learning_rate": 1.9459165154264973e-05, + "loss": 42.5234, + "step": 1946 + }, + { + "epoch": 7.0288939051918735, + "grad_norm": 226.4978485107422, + "learning_rate": 1.945372050816697e-05, + "loss": 44.0725, + "step": 1947 + }, + { + "epoch": 7.0325056433408575, + "grad_norm": 225.68116760253906, + "learning_rate": 1.9448275862068964e-05, + "loss": 42.6408, + "step": 1948 + }, + { + "epoch": 7.036117381489842, + "grad_norm": 182.14202880859375, + "learning_rate": 1.944283121597096e-05, + "loss": 41.7696, + "step": 1949 + }, + { + "epoch": 7.039729119638826, + "grad_norm": 196.1949005126953, + "learning_rate": 1.9437386569872962e-05, + "loss": 42.7008, + "step": 1950 + }, + { + "epoch": 7.039729119638826, + "eval_loss": 0.6277336478233337, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 1950 + }, + { + "epoch": 7.04334085778781, + "grad_norm": 180.6853485107422, + "learning_rate": 1.9431941923774957e-05, + "loss": 41.9946, + "step": 1951 + }, + { + "epoch": 7.046952595936794, + "grad_norm": 199.0644073486328, + "learning_rate": 1.9426497277676953e-05, + "loss": 39.8965, + "step": 1952 + }, + { + "epoch": 7.050564334085779, + "grad_norm": 208.21371459960938, + "learning_rate": 1.9421052631578948e-05, + "loss": 39.3263, + "step": 1953 + }, + { + "epoch": 7.054176072234763, + "grad_norm": 239.78677368164062, + "learning_rate": 1.9415607985480943e-05, + "loss": 40.1478, + "step": 1954 + }, + { + "epoch": 7.057787810383747, + "grad_norm": 211.55030822753906, + "learning_rate": 1.941016333938294e-05, + "loss": 40.061, + "step": 1955 + }, + { + "epoch": 7.061399548532731, + "grad_norm": 199.51455688476562, + "learning_rate": 1.9404718693284937e-05, + "loss": 39.8707, + "step": 1956 + }, + { + "epoch": 7.065011286681716, + "grad_norm": 183.39486694335938, + "learning_rate": 1.9399274047186933e-05, + "loss": 40.3183, + "step": 1957 + }, + { + "epoch": 7.0686230248307, + "grad_norm": 238.36737060546875, + "learning_rate": 1.9393829401088928e-05, + "loss": 40.8581, + "step": 1958 + }, + { + "epoch": 7.072234762979684, + "grad_norm": 202.5072021484375, + "learning_rate": 1.9388384754990927e-05, + "loss": 40.2192, + "step": 1959 + }, + { + "epoch": 7.075846501128668, + "grad_norm": 204.236083984375, + "learning_rate": 1.9382940108892922e-05, + "loss": 40.8533, + "step": 1960 + }, + { + "epoch": 7.075846501128668, + "eval_loss": 0.6252757906913757, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 1960 + }, + { + "epoch": 7.079458239277653, + "grad_norm": 260.2081298828125, + "learning_rate": 1.937749546279492e-05, + "loss": 39.7229, + "step": 1961 + }, + { + "epoch": 7.083069977426637, + "grad_norm": 241.91722106933594, + "learning_rate": 1.9372050816696917e-05, + "loss": 41.547, + "step": 1962 + }, + { + "epoch": 7.0866817155756205, + "grad_norm": 168.9304656982422, + "learning_rate": 1.9366606170598912e-05, + "loss": 41.4826, + "step": 1963 + }, + { + "epoch": 7.090293453724605, + "grad_norm": 230.05349731445312, + "learning_rate": 1.9361161524500907e-05, + "loss": 41.5411, + "step": 1964 + }, + { + "epoch": 7.093905191873589, + "grad_norm": 172.16851806640625, + "learning_rate": 1.9355716878402903e-05, + "loss": 42.2347, + "step": 1965 + }, + { + "epoch": 7.097516930022573, + "grad_norm": 312.65838623046875, + "learning_rate": 1.9350272232304898e-05, + "loss": 41.4039, + "step": 1966 + }, + { + "epoch": 7.101128668171557, + "grad_norm": 249.62351989746094, + "learning_rate": 1.9344827586206897e-05, + "loss": 41.4234, + "step": 1967 + }, + { + "epoch": 7.104740406320542, + "grad_norm": 250.49143981933594, + "learning_rate": 1.9339382940108896e-05, + "loss": 38.0539, + "step": 1968 + }, + { + "epoch": 7.108352144469526, + "grad_norm": 238.41546630859375, + "learning_rate": 1.933393829401089e-05, + "loss": 35.5584, + "step": 1969 + }, + { + "epoch": 7.11196388261851, + "grad_norm": 200.78282165527344, + "learning_rate": 1.9328493647912886e-05, + "loss": 34.4491, + "step": 1970 + }, + { + "epoch": 7.11196388261851, + "eval_loss": 0.6286216378211975, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 1970 + }, + { + "epoch": 7.115575620767494, + "grad_norm": 244.61717224121094, + "learning_rate": 1.9323049001814882e-05, + "loss": 34.5403, + "step": 1971 + }, + { + "epoch": 7.119187358916479, + "grad_norm": 219.14312744140625, + "learning_rate": 1.931760435571688e-05, + "loss": 35.7815, + "step": 1972 + }, + { + "epoch": 7.122799097065463, + "grad_norm": 221.85130310058594, + "learning_rate": 1.9312159709618876e-05, + "loss": 35.638, + "step": 1973 + }, + { + "epoch": 7.126410835214447, + "grad_norm": 237.97921752929688, + "learning_rate": 1.930671506352087e-05, + "loss": 35.1348, + "step": 1974 + }, + { + "epoch": 7.130022573363431, + "grad_norm": 234.06256103515625, + "learning_rate": 1.9301270417422867e-05, + "loss": 35.8709, + "step": 1975 + }, + { + "epoch": 7.133634311512416, + "grad_norm": 231.6852264404297, + "learning_rate": 1.9295825771324862e-05, + "loss": 36.6859, + "step": 1976 + }, + { + "epoch": 7.1372460496614, + "grad_norm": 208.2762908935547, + "learning_rate": 1.9290381125226857e-05, + "loss": 37.24, + "step": 1977 + }, + { + "epoch": 7.140857787810384, + "grad_norm": 219.8532257080078, + "learning_rate": 1.928493647912886e-05, + "loss": 36.4058, + "step": 1978 + }, + { + "epoch": 7.144469525959368, + "grad_norm": 242.73159790039062, + "learning_rate": 1.9279491833030855e-05, + "loss": 36.7565, + "step": 1979 + }, + { + "epoch": 7.148081264108352, + "grad_norm": 227.09645080566406, + "learning_rate": 1.927404718693285e-05, + "loss": 37.6752, + "step": 1980 + }, + { + "epoch": 7.148081264108352, + "eval_loss": 0.6243596076965332, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 1980 + }, + { + "epoch": 7.151693002257336, + "grad_norm": 236.27169799804688, + "learning_rate": 1.9268602540834846e-05, + "loss": 38.3857, + "step": 1981 + }, + { + "epoch": 7.15530474040632, + "grad_norm": 244.84912109375, + "learning_rate": 1.926315789473684e-05, + "loss": 38.414, + "step": 1982 + }, + { + "epoch": 7.158916478555304, + "grad_norm": 203.36798095703125, + "learning_rate": 1.925771324863884e-05, + "loss": 38.938, + "step": 1983 + }, + { + "epoch": 7.162528216704289, + "grad_norm": 225.50152587890625, + "learning_rate": 1.9252268602540835e-05, + "loss": 37.654, + "step": 1984 + }, + { + "epoch": 7.166139954853273, + "grad_norm": 236.4989471435547, + "learning_rate": 1.924682395644283e-05, + "loss": 28.2794, + "step": 1985 + }, + { + "epoch": 7.169751693002257, + "grad_norm": 173.909423828125, + "learning_rate": 1.9241379310344826e-05, + "loss": 23.3804, + "step": 1986 + }, + { + "epoch": 7.173363431151241, + "grad_norm": 195.63526916503906, + "learning_rate": 1.9235934664246825e-05, + "loss": 24.4696, + "step": 1987 + }, + { + "epoch": 7.176975169300226, + "grad_norm": 150.0059356689453, + "learning_rate": 1.923049001814882e-05, + "loss": 23.9438, + "step": 1988 + }, + { + "epoch": 7.18058690744921, + "grad_norm": 217.61630249023438, + "learning_rate": 1.922504537205082e-05, + "loss": 25.4084, + "step": 1989 + }, + { + "epoch": 7.184198645598194, + "grad_norm": 259.2041015625, + "learning_rate": 1.9219600725952814e-05, + "loss": 44.7159, + "step": 1990 + }, + { + "epoch": 7.184198645598194, + "eval_loss": 0.6465168595314026, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 1990 + }, + { + "epoch": 7.187810383747179, + "grad_norm": 282.1758117675781, + "learning_rate": 1.921415607985481e-05, + "loss": 45.7571, + "step": 1991 + }, + { + "epoch": 7.191422121896163, + "grad_norm": 276.5455322265625, + "learning_rate": 1.9208711433756805e-05, + "loss": 44.7227, + "step": 1992 + }, + { + "epoch": 7.195033860045147, + "grad_norm": 251.93589782714844, + "learning_rate": 1.92032667876588e-05, + "loss": 43.0705, + "step": 1993 + }, + { + "epoch": 7.198645598194131, + "grad_norm": 224.8245086669922, + "learning_rate": 1.91978221415608e-05, + "loss": 43.2009, + "step": 1994 + }, + { + "epoch": 7.2022573363431155, + "grad_norm": 233.61770629882812, + "learning_rate": 1.9192377495462795e-05, + "loss": 43.4496, + "step": 1995 + }, + { + "epoch": 7.2058690744920995, + "grad_norm": 188.65252685546875, + "learning_rate": 1.9186932849364793e-05, + "loss": 42.5907, + "step": 1996 + }, + { + "epoch": 7.209480812641083, + "grad_norm": 185.1155242919922, + "learning_rate": 1.918148820326679e-05, + "loss": 44.4651, + "step": 1997 + }, + { + "epoch": 7.213092550790067, + "grad_norm": 169.09701538085938, + "learning_rate": 1.9176043557168784e-05, + "loss": 43.6325, + "step": 1998 + }, + { + "epoch": 7.216704288939052, + "grad_norm": 198.49114990234375, + "learning_rate": 1.9170598911070783e-05, + "loss": 43.5817, + "step": 1999 + }, + { + "epoch": 7.220316027088036, + "grad_norm": 193.17591857910156, + "learning_rate": 1.916515426497278e-05, + "loss": 41.4884, + "step": 2000 + }, + { + "epoch": 7.220316027088036, + "eval_loss": 0.6329721212387085, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 2000 + }, + { + "epoch": 7.22392776523702, + "grad_norm": 202.32730102539062, + "learning_rate": 1.9159709618874774e-05, + "loss": 41.2168, + "step": 2001 + }, + { + "epoch": 7.227539503386004, + "grad_norm": 206.4916534423828, + "learning_rate": 1.915426497277677e-05, + "loss": 39.9909, + "step": 2002 + }, + { + "epoch": 7.231151241534989, + "grad_norm": 202.2099609375, + "learning_rate": 1.9148820326678765e-05, + "loss": 40.1413, + "step": 2003 + }, + { + "epoch": 7.234762979683973, + "grad_norm": 223.7954559326172, + "learning_rate": 1.914337568058076e-05, + "loss": 39.5872, + "step": 2004 + }, + { + "epoch": 7.238374717832957, + "grad_norm": 225.8967742919922, + "learning_rate": 1.9137931034482762e-05, + "loss": 41.3396, + "step": 2005 + }, + { + "epoch": 7.241986455981941, + "grad_norm": 248.0997772216797, + "learning_rate": 1.9132486388384757e-05, + "loss": 39.012, + "step": 2006 + }, + { + "epoch": 7.245598194130926, + "grad_norm": 227.4576873779297, + "learning_rate": 1.9127041742286753e-05, + "loss": 42.5922, + "step": 2007 + }, + { + "epoch": 7.24920993227991, + "grad_norm": 197.62547302246094, + "learning_rate": 1.9121597096188748e-05, + "loss": 41.6107, + "step": 2008 + }, + { + "epoch": 7.252821670428894, + "grad_norm": 170.18817138671875, + "learning_rate": 1.9116152450090744e-05, + "loss": 40.3326, + "step": 2009 + }, + { + "epoch": 7.2564334085778786, + "grad_norm": 186.9420166015625, + "learning_rate": 1.9110707803992742e-05, + "loss": 41.0365, + "step": 2010 + }, + { + "epoch": 7.2564334085778786, + "eval_loss": 0.6230406761169434, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2010 + }, + { + "epoch": 7.2600451467268625, + "grad_norm": 188.11244201660156, + "learning_rate": 1.9105263157894738e-05, + "loss": 42.0278, + "step": 2011 + }, + { + "epoch": 7.2636568848758465, + "grad_norm": 242.47305297851562, + "learning_rate": 1.9099818511796733e-05, + "loss": 41.5539, + "step": 2012 + }, + { + "epoch": 7.2672686230248305, + "grad_norm": 190.83987426757812, + "learning_rate": 1.909437386569873e-05, + "loss": 41.8641, + "step": 2013 + }, + { + "epoch": 7.270880361173815, + "grad_norm": 214.44650268554688, + "learning_rate": 1.9088929219600724e-05, + "loss": 42.232, + "step": 2014 + }, + { + "epoch": 7.274492099322799, + "grad_norm": 216.3888397216797, + "learning_rate": 1.9083484573502723e-05, + "loss": 41.6186, + "step": 2015 + }, + { + "epoch": 7.278103837471783, + "grad_norm": 210.46673583984375, + "learning_rate": 1.907803992740472e-05, + "loss": 42.2099, + "step": 2016 + }, + { + "epoch": 7.281715575620767, + "grad_norm": 194.84165954589844, + "learning_rate": 1.9072595281306717e-05, + "loss": 42.78, + "step": 2017 + }, + { + "epoch": 7.285327313769752, + "grad_norm": 201.91297912597656, + "learning_rate": 1.9067150635208712e-05, + "loss": 38.7115, + "step": 2018 + }, + { + "epoch": 7.288939051918736, + "grad_norm": 245.42625427246094, + "learning_rate": 1.9061705989110708e-05, + "loss": 35.7841, + "step": 2019 + }, + { + "epoch": 7.29255079006772, + "grad_norm": 182.4967041015625, + "learning_rate": 1.9056261343012703e-05, + "loss": 34.3308, + "step": 2020 + }, + { + "epoch": 7.29255079006772, + "eval_loss": 0.6238341331481934, + "eval_runtime": 3.1431, + "eval_samples_per_second": 56.95, + "eval_steps_per_second": 56.95, + "step": 2020 + }, + { + "epoch": 7.296162528216704, + "grad_norm": 297.3916320800781, + "learning_rate": 1.9050816696914702e-05, + "loss": 34.7534, + "step": 2021 + }, + { + "epoch": 7.299774266365689, + "grad_norm": 211.52554321289062, + "learning_rate": 1.9045372050816697e-05, + "loss": 34.0303, + "step": 2022 + }, + { + "epoch": 7.303386004514673, + "grad_norm": 232.99844360351562, + "learning_rate": 1.9039927404718693e-05, + "loss": 35.7378, + "step": 2023 + }, + { + "epoch": 7.306997742663657, + "grad_norm": 230.34642028808594, + "learning_rate": 1.903448275862069e-05, + "loss": 36.7492, + "step": 2024 + }, + { + "epoch": 7.310609480812641, + "grad_norm": 228.88966369628906, + "learning_rate": 1.9029038112522687e-05, + "loss": 35.1188, + "step": 2025 + }, + { + "epoch": 7.314221218961626, + "grad_norm": 213.2604522705078, + "learning_rate": 1.9023593466424682e-05, + "loss": 35.0688, + "step": 2026 + }, + { + "epoch": 7.3178329571106095, + "grad_norm": 202.62200927734375, + "learning_rate": 1.901814882032668e-05, + "loss": 37.6721, + "step": 2027 + }, + { + "epoch": 7.3214446952595935, + "grad_norm": 191.8877410888672, + "learning_rate": 1.9012704174228676e-05, + "loss": 36.7728, + "step": 2028 + }, + { + "epoch": 7.3250564334085775, + "grad_norm": 211.57571411132812, + "learning_rate": 1.900725952813067e-05, + "loss": 36.6342, + "step": 2029 + }, + { + "epoch": 7.328668171557562, + "grad_norm": 177.2289581298828, + "learning_rate": 1.9001814882032667e-05, + "loss": 36.8319, + "step": 2030 + }, + { + "epoch": 7.328668171557562, + "eval_loss": 0.6231008172035217, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2030 + }, + { + "epoch": 7.332279909706546, + "grad_norm": 227.7028350830078, + "learning_rate": 1.8996370235934662e-05, + "loss": 36.6706, + "step": 2031 + }, + { + "epoch": 7.33589164785553, + "grad_norm": 229.02972412109375, + "learning_rate": 1.899092558983666e-05, + "loss": 37.0749, + "step": 2032 + }, + { + "epoch": 7.339503386004514, + "grad_norm": 234.30946350097656, + "learning_rate": 1.898548094373866e-05, + "loss": 37.3716, + "step": 2033 + }, + { + "epoch": 7.343115124153499, + "grad_norm": 236.79893493652344, + "learning_rate": 1.8980036297640655e-05, + "loss": 38.9503, + "step": 2034 + }, + { + "epoch": 7.346726862302483, + "grad_norm": 256.5646057128906, + "learning_rate": 1.897459165154265e-05, + "loss": 32.5056, + "step": 2035 + }, + { + "epoch": 7.350338600451467, + "grad_norm": 183.38961791992188, + "learning_rate": 1.8969147005444646e-05, + "loss": 25.3982, + "step": 2036 + }, + { + "epoch": 7.353950338600452, + "grad_norm": 214.09742736816406, + "learning_rate": 1.896370235934664e-05, + "loss": 23.2743, + "step": 2037 + }, + { + "epoch": 7.357562076749436, + "grad_norm": 190.10867309570312, + "learning_rate": 1.895825771324864e-05, + "loss": 24.8062, + "step": 2038 + }, + { + "epoch": 7.36117381489842, + "grad_norm": 197.85313415527344, + "learning_rate": 1.8952813067150636e-05, + "loss": 25.5098, + "step": 2039 + }, + { + "epoch": 7.364785553047404, + "grad_norm": 235.79090881347656, + "learning_rate": 1.894736842105263e-05, + "loss": 44.3536, + "step": 2040 + }, + { + "epoch": 7.364785553047404, + "eval_loss": 0.6341925263404846, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.785, + "eval_steps_per_second": 56.785, + "step": 2040 + }, + { + "epoch": 7.368397291196389, + "grad_norm": 232.7415771484375, + "learning_rate": 1.8941923774954626e-05, + "loss": 44.6073, + "step": 2041 + }, + { + "epoch": 7.372009029345373, + "grad_norm": 302.3766174316406, + "learning_rate": 1.8936479128856625e-05, + "loss": 43.8575, + "step": 2042 + }, + { + "epoch": 7.375620767494357, + "grad_norm": 208.41441345214844, + "learning_rate": 1.8931034482758624e-05, + "loss": 42.4378, + "step": 2043 + }, + { + "epoch": 7.3792325056433405, + "grad_norm": 228.000732421875, + "learning_rate": 1.892558983666062e-05, + "loss": 44.5641, + "step": 2044 + }, + { + "epoch": 7.382844243792325, + "grad_norm": 201.757080078125, + "learning_rate": 1.8920145190562615e-05, + "loss": 43.7578, + "step": 2045 + }, + { + "epoch": 7.386455981941309, + "grad_norm": 220.2481689453125, + "learning_rate": 1.891470054446461e-05, + "loss": 42.755, + "step": 2046 + }, + { + "epoch": 7.390067720090293, + "grad_norm": 225.5443115234375, + "learning_rate": 1.8909255898366605e-05, + "loss": 44.3785, + "step": 2047 + }, + { + "epoch": 7.393679458239277, + "grad_norm": 200.2024688720703, + "learning_rate": 1.89038112522686e-05, + "loss": 42.994, + "step": 2048 + }, + { + "epoch": 7.397291196388262, + "grad_norm": 205.64794921875, + "learning_rate": 1.88983666061706e-05, + "loss": 43.1902, + "step": 2049 + }, + { + "epoch": 7.400902934537246, + "grad_norm": 183.3535919189453, + "learning_rate": 1.8892921960072595e-05, + "loss": 40.9422, + "step": 2050 + }, + { + "epoch": 7.400902934537246, + "eval_loss": 0.626913845539093, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2050 + }, + { + "epoch": 7.40451467268623, + "grad_norm": 201.8138885498047, + "learning_rate": 1.8887477313974594e-05, + "loss": 39.4408, + "step": 2051 + }, + { + "epoch": 7.408126410835214, + "grad_norm": 201.8863525390625, + "learning_rate": 1.888203266787659e-05, + "loss": 39.5467, + "step": 2052 + }, + { + "epoch": 7.411738148984199, + "grad_norm": 239.10687255859375, + "learning_rate": 1.8876588021778585e-05, + "loss": 41.2256, + "step": 2053 + }, + { + "epoch": 7.415349887133183, + "grad_norm": 209.47796630859375, + "learning_rate": 1.8871143375680583e-05, + "loss": 40.8963, + "step": 2054 + }, + { + "epoch": 7.418961625282167, + "grad_norm": 202.6414794921875, + "learning_rate": 1.886569872958258e-05, + "loss": 40.5138, + "step": 2055 + }, + { + "epoch": 7.422573363431152, + "grad_norm": 198.01795959472656, + "learning_rate": 1.8860254083484574e-05, + "loss": 39.1767, + "step": 2056 + }, + { + "epoch": 7.426185101580136, + "grad_norm": 173.26507568359375, + "learning_rate": 1.885480943738657e-05, + "loss": 40.6713, + "step": 2057 + }, + { + "epoch": 7.42979683972912, + "grad_norm": 166.11607360839844, + "learning_rate": 1.8849364791288565e-05, + "loss": 41.2602, + "step": 2058 + }, + { + "epoch": 7.433408577878104, + "grad_norm": 200.76956176757812, + "learning_rate": 1.884392014519056e-05, + "loss": 41.0714, + "step": 2059 + }, + { + "epoch": 7.437020316027088, + "grad_norm": 213.75315856933594, + "learning_rate": 1.883847549909256e-05, + "loss": 39.6812, + "step": 2060 + }, + { + "epoch": 7.437020316027088, + "eval_loss": 0.6279598474502563, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.0, + "eval_steps_per_second": 57.0, + "step": 2060 + }, + { + "epoch": 7.440632054176072, + "grad_norm": 221.25025939941406, + "learning_rate": 1.8833030852994558e-05, + "loss": 41.6964, + "step": 2061 + }, + { + "epoch": 7.444243792325056, + "grad_norm": 171.32106018066406, + "learning_rate": 1.8827586206896553e-05, + "loss": 41.4608, + "step": 2062 + }, + { + "epoch": 7.44785553047404, + "grad_norm": 222.76600646972656, + "learning_rate": 1.882214156079855e-05, + "loss": 41.2687, + "step": 2063 + }, + { + "epoch": 7.451467268623025, + "grad_norm": 169.82395935058594, + "learning_rate": 1.8816696914700544e-05, + "loss": 41.6048, + "step": 2064 + }, + { + "epoch": 7.455079006772009, + "grad_norm": 190.5113525390625, + "learning_rate": 1.8811252268602543e-05, + "loss": 41.8843, + "step": 2065 + }, + { + "epoch": 7.458690744920993, + "grad_norm": 194.5990447998047, + "learning_rate": 1.8805807622504538e-05, + "loss": 43.5968, + "step": 2066 + }, + { + "epoch": 7.462302483069977, + "grad_norm": 216.0985870361328, + "learning_rate": 1.8800362976406533e-05, + "loss": 41.6743, + "step": 2067 + }, + { + "epoch": 7.465914221218962, + "grad_norm": 249.05270385742188, + "learning_rate": 1.879491833030853e-05, + "loss": 39.4203, + "step": 2068 + }, + { + "epoch": 7.469525959367946, + "grad_norm": 232.5495147705078, + "learning_rate": 1.8789473684210524e-05, + "loss": 36.2202, + "step": 2069 + }, + { + "epoch": 7.47313769751693, + "grad_norm": 218.72299194335938, + "learning_rate": 1.8784029038112523e-05, + "loss": 34.9116, + "step": 2070 + }, + { + "epoch": 7.47313769751693, + "eval_loss": 0.6241349577903748, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.827, + "eval_steps_per_second": 56.827, + "step": 2070 + }, + { + "epoch": 7.476749435665914, + "grad_norm": 241.78179931640625, + "learning_rate": 1.8778584392014522e-05, + "loss": 36.2476, + "step": 2071 + }, + { + "epoch": 7.480361173814899, + "grad_norm": 194.92982482910156, + "learning_rate": 1.8773139745916517e-05, + "loss": 34.4524, + "step": 2072 + }, + { + "epoch": 7.483972911963883, + "grad_norm": 227.76156616210938, + "learning_rate": 1.8767695099818513e-05, + "loss": 34.5292, + "step": 2073 + }, + { + "epoch": 7.487584650112867, + "grad_norm": 287.61309814453125, + "learning_rate": 1.8762250453720508e-05, + "loss": 37.8068, + "step": 2074 + }, + { + "epoch": 7.491196388261851, + "grad_norm": 191.0822296142578, + "learning_rate": 1.8756805807622503e-05, + "loss": 36.0941, + "step": 2075 + }, + { + "epoch": 7.4948081264108355, + "grad_norm": 197.5564422607422, + "learning_rate": 1.8751361161524502e-05, + "loss": 36.3624, + "step": 2076 + }, + { + "epoch": 7.4984198645598195, + "grad_norm": 187.72479248046875, + "learning_rate": 1.8745916515426497e-05, + "loss": 37.5074, + "step": 2077 + }, + { + "epoch": 7.502031602708803, + "grad_norm": 220.4607391357422, + "learning_rate": 1.8740471869328493e-05, + "loss": 35.6139, + "step": 2078 + }, + { + "epoch": 7.505643340857787, + "grad_norm": 179.05612182617188, + "learning_rate": 1.873502722323049e-05, + "loss": 37.7286, + "step": 2079 + }, + { + "epoch": 7.509255079006772, + "grad_norm": 230.91879272460938, + "learning_rate": 1.8729582577132487e-05, + "loss": 36.1803, + "step": 2080 + }, + { + "epoch": 7.509255079006772, + "eval_loss": 0.6255043148994446, + "eval_runtime": 3.1466, + "eval_samples_per_second": 56.887, + "eval_steps_per_second": 56.887, + "step": 2080 + }, + { + "epoch": 7.512866817155756, + "grad_norm": 182.89437866210938, + "learning_rate": 1.8724137931034482e-05, + "loss": 36.5782, + "step": 2081 + }, + { + "epoch": 7.51647855530474, + "grad_norm": 215.36769104003906, + "learning_rate": 1.871869328493648e-05, + "loss": 38.233, + "step": 2082 + }, + { + "epoch": 7.520090293453725, + "grad_norm": 232.6095733642578, + "learning_rate": 1.8713248638838477e-05, + "loss": 38.6268, + "step": 2083 + }, + { + "epoch": 7.523702031602709, + "grad_norm": 236.94281005859375, + "learning_rate": 1.8707803992740472e-05, + "loss": 38.1768, + "step": 2084 + }, + { + "epoch": 7.527313769751693, + "grad_norm": 214.16079711914062, + "learning_rate": 1.8702359346642467e-05, + "loss": 27.514, + "step": 2085 + }, + { + "epoch": 7.530925507900677, + "grad_norm": 192.6107940673828, + "learning_rate": 1.8696914700544463e-05, + "loss": 24.274, + "step": 2086 + }, + { + "epoch": 7.534537246049661, + "grad_norm": 217.98619079589844, + "learning_rate": 1.869147005444646e-05, + "loss": 23.2824, + "step": 2087 + }, + { + "epoch": 7.538148984198646, + "grad_norm": 183.04296875, + "learning_rate": 1.868602540834846e-05, + "loss": 24.9622, + "step": 2088 + }, + { + "epoch": 7.54176072234763, + "grad_norm": 167.1417236328125, + "learning_rate": 1.8680580762250456e-05, + "loss": 25.1446, + "step": 2089 + }, + { + "epoch": 7.545372460496614, + "grad_norm": 287.29937744140625, + "learning_rate": 1.867513611615245e-05, + "loss": 44.1171, + "step": 2090 + }, + { + "epoch": 7.545372460496614, + "eval_loss": 0.6376849412918091, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.929, + "eval_steps_per_second": 56.929, + "step": 2090 + }, + { + "epoch": 7.5489841986455986, + "grad_norm": 285.3408203125, + "learning_rate": 1.8669691470054446e-05, + "loss": 46.3716, + "step": 2091 + }, + { + "epoch": 7.5525959367945825, + "grad_norm": 233.18389892578125, + "learning_rate": 1.8664246823956445e-05, + "loss": 44.0514, + "step": 2092 + }, + { + "epoch": 7.5562076749435665, + "grad_norm": 256.4196472167969, + "learning_rate": 1.865880217785844e-05, + "loss": 44.1784, + "step": 2093 + }, + { + "epoch": 7.5598194130925505, + "grad_norm": 223.28128051757812, + "learning_rate": 1.8653357531760436e-05, + "loss": 42.9897, + "step": 2094 + }, + { + "epoch": 7.563431151241535, + "grad_norm": 235.2901153564453, + "learning_rate": 1.864791288566243e-05, + "loss": 43.7651, + "step": 2095 + }, + { + "epoch": 7.567042889390519, + "grad_norm": 285.9206237792969, + "learning_rate": 1.8642468239564427e-05, + "loss": 44.6333, + "step": 2096 + }, + { + "epoch": 7.570654627539503, + "grad_norm": 200.00210571289062, + "learning_rate": 1.8637023593466425e-05, + "loss": 43.9845, + "step": 2097 + }, + { + "epoch": 7.574266365688487, + "grad_norm": 277.73394775390625, + "learning_rate": 1.8631578947368424e-05, + "loss": 44.7301, + "step": 2098 + }, + { + "epoch": 7.577878103837472, + "grad_norm": 216.9422149658203, + "learning_rate": 1.862613430127042e-05, + "loss": 44.0409, + "step": 2099 + }, + { + "epoch": 7.581489841986456, + "grad_norm": 198.86639404296875, + "learning_rate": 1.8620689655172415e-05, + "loss": 43.4026, + "step": 2100 + }, + { + "epoch": 7.581489841986456, + "eval_loss": 0.6270378232002258, + "eval_runtime": 3.1464, + "eval_samples_per_second": 56.891, + "eval_steps_per_second": 56.891, + "step": 2100 + }, + { + "epoch": 7.58510158013544, + "grad_norm": 240.495361328125, + "learning_rate": 1.861524500907441e-05, + "loss": 41.4092, + "step": 2101 + }, + { + "epoch": 7.588713318284425, + "grad_norm": 240.1851043701172, + "learning_rate": 1.8609800362976406e-05, + "loss": 40.1396, + "step": 2102 + }, + { + "epoch": 7.592325056433409, + "grad_norm": 241.21495056152344, + "learning_rate": 1.8604355716878405e-05, + "loss": 39.1778, + "step": 2103 + }, + { + "epoch": 7.595936794582393, + "grad_norm": 287.3133544921875, + "learning_rate": 1.85989110707804e-05, + "loss": 41.0348, + "step": 2104 + }, + { + "epoch": 7.599548532731377, + "grad_norm": 230.4313201904297, + "learning_rate": 1.8593466424682395e-05, + "loss": 39.5872, + "step": 2105 + }, + { + "epoch": 7.603160270880361, + "grad_norm": 210.32962036132812, + "learning_rate": 1.858802177858439e-05, + "loss": 40.6146, + "step": 2106 + }, + { + "epoch": 7.606772009029346, + "grad_norm": 185.81752014160156, + "learning_rate": 1.858257713248639e-05, + "loss": 39.6363, + "step": 2107 + }, + { + "epoch": 7.6103837471783295, + "grad_norm": 234.63037109375, + "learning_rate": 1.8577132486388385e-05, + "loss": 40.558, + "step": 2108 + }, + { + "epoch": 7.6139954853273135, + "grad_norm": 289.92803955078125, + "learning_rate": 1.8571687840290384e-05, + "loss": 41.1624, + "step": 2109 + }, + { + "epoch": 7.617607223476298, + "grad_norm": 252.82188415527344, + "learning_rate": 1.856624319419238e-05, + "loss": 41.7827, + "step": 2110 + }, + { + "epoch": 7.617607223476298, + "eval_loss": 0.6290409564971924, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2110 + }, + { + "epoch": 7.621218961625282, + "grad_norm": 201.8303985595703, + "learning_rate": 1.8560798548094374e-05, + "loss": 39.0072, + "step": 2111 + }, + { + "epoch": 7.624830699774266, + "grad_norm": 158.71446228027344, + "learning_rate": 1.855535390199637e-05, + "loss": 39.9822, + "step": 2112 + }, + { + "epoch": 7.62844243792325, + "grad_norm": 171.3879852294922, + "learning_rate": 1.8549909255898365e-05, + "loss": 42.1973, + "step": 2113 + }, + { + "epoch": 7.632054176072235, + "grad_norm": 218.584228515625, + "learning_rate": 1.8544464609800364e-05, + "loss": 42.933, + "step": 2114 + }, + { + "epoch": 7.635665914221219, + "grad_norm": 200.60093688964844, + "learning_rate": 1.853901996370236e-05, + "loss": 41.9847, + "step": 2115 + }, + { + "epoch": 7.639277652370203, + "grad_norm": 210.75128173828125, + "learning_rate": 1.8533575317604358e-05, + "loss": 42.4961, + "step": 2116 + }, + { + "epoch": 7.642889390519187, + "grad_norm": 187.47406005859375, + "learning_rate": 1.8528130671506353e-05, + "loss": 39.3404, + "step": 2117 + }, + { + "epoch": 7.646501128668172, + "grad_norm": 204.87693786621094, + "learning_rate": 1.852268602540835e-05, + "loss": 40.3011, + "step": 2118 + }, + { + "epoch": 7.650112866817156, + "grad_norm": 228.8159637451172, + "learning_rate": 1.8517241379310344e-05, + "loss": 37.4416, + "step": 2119 + }, + { + "epoch": 7.65372460496614, + "grad_norm": 237.59664916992188, + "learning_rate": 1.8511796733212343e-05, + "loss": 35.3079, + "step": 2120 + }, + { + "epoch": 7.65372460496614, + "eval_loss": 0.6256567239761353, + "eval_runtime": 3.1458, + "eval_samples_per_second": 56.902, + "eval_steps_per_second": 56.902, + "step": 2120 + }, + { + "epoch": 7.657336343115124, + "grad_norm": 233.3187713623047, + "learning_rate": 1.850635208711434e-05, + "loss": 34.5055, + "step": 2121 + }, + { + "epoch": 7.660948081264109, + "grad_norm": 232.7037353515625, + "learning_rate": 1.8500907441016334e-05, + "loss": 34.1232, + "step": 2122 + }, + { + "epoch": 7.664559819413093, + "grad_norm": 254.53050231933594, + "learning_rate": 1.849546279491833e-05, + "loss": 35.3301, + "step": 2123 + }, + { + "epoch": 7.668171557562077, + "grad_norm": 234.93154907226562, + "learning_rate": 1.8490018148820324e-05, + "loss": 35.9202, + "step": 2124 + }, + { + "epoch": 7.6717832957110605, + "grad_norm": 237.99671936035156, + "learning_rate": 1.8484573502722327e-05, + "loss": 36.5702, + "step": 2125 + }, + { + "epoch": 7.675395033860045, + "grad_norm": 186.25271606445312, + "learning_rate": 1.8479128856624322e-05, + "loss": 35.9423, + "step": 2126 + }, + { + "epoch": 7.679006772009029, + "grad_norm": 226.461669921875, + "learning_rate": 1.8473684210526317e-05, + "loss": 37.4121, + "step": 2127 + }, + { + "epoch": 7.682618510158013, + "grad_norm": 227.0966033935547, + "learning_rate": 1.8468239564428313e-05, + "loss": 36.8802, + "step": 2128 + }, + { + "epoch": 7.686230248306998, + "grad_norm": 193.4064178466797, + "learning_rate": 1.8462794918330308e-05, + "loss": 36.0245, + "step": 2129 + }, + { + "epoch": 7.689841986455982, + "grad_norm": 279.1668395996094, + "learning_rate": 1.8457350272232304e-05, + "loss": 37.4833, + "step": 2130 + }, + { + "epoch": 7.689841986455982, + "eval_loss": 0.6227458715438843, + "eval_runtime": 3.1429, + "eval_samples_per_second": 56.953, + "eval_steps_per_second": 56.953, + "step": 2130 + }, + { + "epoch": 7.693453724604966, + "grad_norm": 254.59234619140625, + "learning_rate": 1.8451905626134302e-05, + "loss": 36.8538, + "step": 2131 + }, + { + "epoch": 7.69706546275395, + "grad_norm": 191.14463806152344, + "learning_rate": 1.8446460980036298e-05, + "loss": 37.8517, + "step": 2132 + }, + { + "epoch": 7.700677200902934, + "grad_norm": 189.20896911621094, + "learning_rate": 1.8441016333938293e-05, + "loss": 38.406, + "step": 2133 + }, + { + "epoch": 7.704288939051919, + "grad_norm": 209.61175537109375, + "learning_rate": 1.8435571687840292e-05, + "loss": 37.7692, + "step": 2134 + }, + { + "epoch": 7.707900677200903, + "grad_norm": 220.5150146484375, + "learning_rate": 1.8430127041742287e-05, + "loss": 36.087, + "step": 2135 + }, + { + "epoch": 7.711512415349887, + "grad_norm": 211.78372192382812, + "learning_rate": 1.8424682395644286e-05, + "loss": 25.6052, + "step": 2136 + }, + { + "epoch": 7.715124153498872, + "grad_norm": 223.85789489746094, + "learning_rate": 1.841923774954628e-05, + "loss": 23.5576, + "step": 2137 + }, + { + "epoch": 7.718735891647856, + "grad_norm": 163.74220275878906, + "learning_rate": 1.8413793103448277e-05, + "loss": 24.4869, + "step": 2138 + }, + { + "epoch": 7.72234762979684, + "grad_norm": 182.80079650878906, + "learning_rate": 1.8408348457350272e-05, + "loss": 25.1878, + "step": 2139 + }, + { + "epoch": 7.725959367945824, + "grad_norm": 296.0340270996094, + "learning_rate": 1.8402903811252268e-05, + "loss": 44.4643, + "step": 2140 + }, + { + "epoch": 7.725959367945824, + "eval_loss": 0.6382863521575928, + "eval_runtime": 3.1441, + "eval_samples_per_second": 56.932, + "eval_steps_per_second": 56.932, + "step": 2140 + }, + { + "epoch": 7.7295711060948085, + "grad_norm": 248.48643493652344, + "learning_rate": 1.8397459165154263e-05, + "loss": 45.2141, + "step": 2141 + }, + { + "epoch": 7.733182844243792, + "grad_norm": 240.9061279296875, + "learning_rate": 1.8392014519056262e-05, + "loss": 42.9435, + "step": 2142 + }, + { + "epoch": 7.736794582392776, + "grad_norm": 231.62315368652344, + "learning_rate": 1.8386569872958257e-05, + "loss": 42.9769, + "step": 2143 + }, + { + "epoch": 7.74040632054176, + "grad_norm": 244.36915588378906, + "learning_rate": 1.8381125226860256e-05, + "loss": 43.6058, + "step": 2144 + }, + { + "epoch": 7.744018058690745, + "grad_norm": 252.9080047607422, + "learning_rate": 1.837568058076225e-05, + "loss": 43.1753, + "step": 2145 + }, + { + "epoch": 7.747629796839729, + "grad_norm": 274.0201721191406, + "learning_rate": 1.8370235934664247e-05, + "loss": 43.3285, + "step": 2146 + }, + { + "epoch": 7.751241534988713, + "grad_norm": 226.75595092773438, + "learning_rate": 1.8364791288566245e-05, + "loss": 43.3158, + "step": 2147 + }, + { + "epoch": 7.754853273137698, + "grad_norm": 197.0859832763672, + "learning_rate": 1.835934664246824e-05, + "loss": 43.5773, + "step": 2148 + }, + { + "epoch": 7.758465011286682, + "grad_norm": 212.14720153808594, + "learning_rate": 1.8353901996370236e-05, + "loss": 43.9208, + "step": 2149 + }, + { + "epoch": 7.762076749435666, + "grad_norm": 230.22158813476562, + "learning_rate": 1.834845735027223e-05, + "loss": 42.8429, + "step": 2150 + }, + { + "epoch": 7.762076749435666, + "eval_loss": 0.6291994452476501, + "eval_runtime": 3.1473, + "eval_samples_per_second": 56.874, + "eval_steps_per_second": 56.874, + "step": 2150 + }, + { + "epoch": 7.76568848758465, + "grad_norm": 215.79391479492188, + "learning_rate": 1.8343012704174227e-05, + "loss": 40.7289, + "step": 2151 + }, + { + "epoch": 7.769300225733634, + "grad_norm": 210.00296020507812, + "learning_rate": 1.8337568058076222e-05, + "loss": 39.9759, + "step": 2152 + }, + { + "epoch": 7.772911963882619, + "grad_norm": 291.2987976074219, + "learning_rate": 1.8332123411978224e-05, + "loss": 40.551, + "step": 2153 + }, + { + "epoch": 7.776523702031603, + "grad_norm": 218.08819580078125, + "learning_rate": 1.832667876588022e-05, + "loss": 40.7981, + "step": 2154 + }, + { + "epoch": 7.780135440180587, + "grad_norm": 268.615966796875, + "learning_rate": 1.8321234119782215e-05, + "loss": 40.5463, + "step": 2155 + }, + { + "epoch": 7.7837471783295715, + "grad_norm": 269.939697265625, + "learning_rate": 1.831578947368421e-05, + "loss": 40.6168, + "step": 2156 + }, + { + "epoch": 7.7873589164785555, + "grad_norm": 268.9761657714844, + "learning_rate": 1.8310344827586206e-05, + "loss": 41.2449, + "step": 2157 + }, + { + "epoch": 7.7909706546275395, + "grad_norm": 161.08811950683594, + "learning_rate": 1.8304900181488205e-05, + "loss": 40.6308, + "step": 2158 + }, + { + "epoch": 7.794582392776523, + "grad_norm": 190.44696044921875, + "learning_rate": 1.82994555353902e-05, + "loss": 40.9708, + "step": 2159 + }, + { + "epoch": 7.798194130925508, + "grad_norm": 202.4305419921875, + "learning_rate": 1.8294010889292196e-05, + "loss": 41.2053, + "step": 2160 + }, + { + "epoch": 7.798194130925508, + "eval_loss": 0.6233534812927246, + "eval_runtime": 3.1457, + "eval_samples_per_second": 56.903, + "eval_steps_per_second": 56.903, + "step": 2160 + }, + { + "epoch": 7.801805869074492, + "grad_norm": 188.5523681640625, + "learning_rate": 1.828856624319419e-05, + "loss": 40.3928, + "step": 2161 + }, + { + "epoch": 7.805417607223476, + "grad_norm": 184.18296813964844, + "learning_rate": 1.828312159709619e-05, + "loss": 42.3466, + "step": 2162 + }, + { + "epoch": 7.80902934537246, + "grad_norm": 223.9243927001953, + "learning_rate": 1.8277676950998185e-05, + "loss": 42.0301, + "step": 2163 + }, + { + "epoch": 7.812641083521445, + "grad_norm": 202.3498077392578, + "learning_rate": 1.8272232304900184e-05, + "loss": 42.3284, + "step": 2164 + }, + { + "epoch": 7.816252821670429, + "grad_norm": 205.77940368652344, + "learning_rate": 1.826678765880218e-05, + "loss": 42.0951, + "step": 2165 + }, + { + "epoch": 7.819864559819413, + "grad_norm": 191.46728515625, + "learning_rate": 1.8261343012704175e-05, + "loss": 40.826, + "step": 2166 + }, + { + "epoch": 7.823476297968397, + "grad_norm": 276.8330383300781, + "learning_rate": 1.825589836660617e-05, + "loss": 42.7909, + "step": 2167 + }, + { + "epoch": 7.827088036117382, + "grad_norm": 181.93955993652344, + "learning_rate": 1.8250453720508165e-05, + "loss": 38.6068, + "step": 2168 + }, + { + "epoch": 7.830699774266366, + "grad_norm": 178.79856872558594, + "learning_rate": 1.8245009074410164e-05, + "loss": 35.694, + "step": 2169 + }, + { + "epoch": 7.83431151241535, + "grad_norm": 224.6522979736328, + "learning_rate": 1.823956442831216e-05, + "loss": 36.7127, + "step": 2170 + }, + { + "epoch": 7.83431151241535, + "eval_loss": 0.6237645745277405, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.947, + "eval_steps_per_second": 56.947, + "step": 2170 + }, + { + "epoch": 7.837923250564334, + "grad_norm": 203.37196350097656, + "learning_rate": 1.823411978221416e-05, + "loss": 34.0039, + "step": 2171 + }, + { + "epoch": 7.8415349887133186, + "grad_norm": 212.79307556152344, + "learning_rate": 1.8228675136116154e-05, + "loss": 33.2787, + "step": 2172 + }, + { + "epoch": 7.8451467268623025, + "grad_norm": 215.5691375732422, + "learning_rate": 1.822323049001815e-05, + "loss": 35.4241, + "step": 2173 + }, + { + "epoch": 7.8487584650112865, + "grad_norm": 230.0751190185547, + "learning_rate": 1.8217785843920144e-05, + "loss": 36.9333, + "step": 2174 + }, + { + "epoch": 7.852370203160271, + "grad_norm": 217.8132781982422, + "learning_rate": 1.8212341197822143e-05, + "loss": 35.7233, + "step": 2175 + }, + { + "epoch": 7.855981941309255, + "grad_norm": 245.93177795410156, + "learning_rate": 1.820689655172414e-05, + "loss": 36.6111, + "step": 2176 + }, + { + "epoch": 7.859593679458239, + "grad_norm": 210.58218383789062, + "learning_rate": 1.8201451905626134e-05, + "loss": 36.3243, + "step": 2177 + }, + { + "epoch": 7.863205417607223, + "grad_norm": 234.6280059814453, + "learning_rate": 1.819600725952813e-05, + "loss": 37.0315, + "step": 2178 + }, + { + "epoch": 7.866817155756207, + "grad_norm": 184.53121948242188, + "learning_rate": 1.8190562613430125e-05, + "loss": 35.8725, + "step": 2179 + }, + { + "epoch": 7.870428893905192, + "grad_norm": 201.5563507080078, + "learning_rate": 1.8185117967332127e-05, + "loss": 37.9183, + "step": 2180 + }, + { + "epoch": 7.870428893905192, + "eval_loss": 0.6210297346115112, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2180 + }, + { + "epoch": 7.874040632054176, + "grad_norm": 192.29579162597656, + "learning_rate": 1.8179673321234122e-05, + "loss": 37.1709, + "step": 2181 + }, + { + "epoch": 7.87765237020316, + "grad_norm": 246.0638427734375, + "learning_rate": 1.8174228675136118e-05, + "loss": 38.5338, + "step": 2182 + }, + { + "epoch": 7.881264108352145, + "grad_norm": 237.47607421875, + "learning_rate": 1.8168784029038113e-05, + "loss": 37.7041, + "step": 2183 + }, + { + "epoch": 7.884875846501129, + "grad_norm": 215.06407165527344, + "learning_rate": 1.816333938294011e-05, + "loss": 38.1663, + "step": 2184 + }, + { + "epoch": 7.888487584650113, + "grad_norm": 193.76809692382812, + "learning_rate": 1.8157894736842107e-05, + "loss": 32.1679, + "step": 2185 + }, + { + "epoch": 7.892099322799097, + "grad_norm": 208.66111755371094, + "learning_rate": 1.8152450090744103e-05, + "loss": 24.2413, + "step": 2186 + }, + { + "epoch": 7.895711060948082, + "grad_norm": 182.810546875, + "learning_rate": 1.8147005444646098e-05, + "loss": 24.1102, + "step": 2187 + }, + { + "epoch": 7.899322799097066, + "grad_norm": 200.25823974609375, + "learning_rate": 1.8141560798548093e-05, + "loss": 24.5778, + "step": 2188 + }, + { + "epoch": 7.9029345372460496, + "grad_norm": 224.19125366210938, + "learning_rate": 1.813611615245009e-05, + "loss": 26.1643, + "step": 2189 + }, + { + "epoch": 7.9065462753950335, + "grad_norm": 261.03033447265625, + "learning_rate": 1.8130671506352088e-05, + "loss": 45.1071, + "step": 2190 + }, + { + "epoch": 7.9065462753950335, + "eval_loss": 0.6303785443305969, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2190 + }, + { + "epoch": 7.910158013544018, + "grad_norm": 273.6593322753906, + "learning_rate": 1.8125226860254086e-05, + "loss": 43.8271, + "step": 2191 + }, + { + "epoch": 7.913769751693002, + "grad_norm": 304.0534362792969, + "learning_rate": 1.8119782214156082e-05, + "loss": 43.7623, + "step": 2192 + }, + { + "epoch": 7.917381489841986, + "grad_norm": 249.27255249023438, + "learning_rate": 1.8114337568058077e-05, + "loss": 43.7191, + "step": 2193 + }, + { + "epoch": 7.92099322799097, + "grad_norm": 199.5006103515625, + "learning_rate": 1.8108892921960072e-05, + "loss": 44.1019, + "step": 2194 + }, + { + "epoch": 7.924604966139955, + "grad_norm": 228.42832946777344, + "learning_rate": 1.8103448275862068e-05, + "loss": 43.9717, + "step": 2195 + }, + { + "epoch": 7.928216704288939, + "grad_norm": 247.20901489257812, + "learning_rate": 1.8098003629764067e-05, + "loss": 40.022, + "step": 2196 + }, + { + "epoch": 7.931828442437923, + "grad_norm": 297.5372619628906, + "learning_rate": 1.8092558983666062e-05, + "loss": 40.6639, + "step": 2197 + }, + { + "epoch": 7.935440180586907, + "grad_norm": 245.11915588378906, + "learning_rate": 1.8087114337568057e-05, + "loss": 40.3569, + "step": 2198 + }, + { + "epoch": 7.939051918735892, + "grad_norm": 255.53297424316406, + "learning_rate": 1.8081669691470056e-05, + "loss": 41.7983, + "step": 2199 + }, + { + "epoch": 7.942663656884876, + "grad_norm": 226.12783813476562, + "learning_rate": 1.807622504537205e-05, + "loss": 41.7844, + "step": 2200 + }, + { + "epoch": 7.942663656884876, + "eval_loss": 0.6214397549629211, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2200 + }, + { + "epoch": 7.94627539503386, + "grad_norm": 220.90577697753906, + "learning_rate": 1.8070780399274047e-05, + "loss": 42.057, + "step": 2201 + }, + { + "epoch": 7.949887133182845, + "grad_norm": 192.33856201171875, + "learning_rate": 1.8065335753176046e-05, + "loss": 42.0299, + "step": 2202 + }, + { + "epoch": 7.953498871331829, + "grad_norm": 192.8511962890625, + "learning_rate": 1.805989110707804e-05, + "loss": 41.7752, + "step": 2203 + }, + { + "epoch": 7.957110609480813, + "grad_norm": 223.10275268554688, + "learning_rate": 1.8054446460980036e-05, + "loss": 41.0178, + "step": 2204 + }, + { + "epoch": 7.960722347629797, + "grad_norm": 189.8402099609375, + "learning_rate": 1.8049001814882032e-05, + "loss": 37.9747, + "step": 2205 + }, + { + "epoch": 7.9643340857787805, + "grad_norm": 233.5938720703125, + "learning_rate": 1.8043557168784027e-05, + "loss": 35.3994, + "step": 2206 + }, + { + "epoch": 7.967945823927765, + "grad_norm": 218.5577850341797, + "learning_rate": 1.8038112522686026e-05, + "loss": 35.1967, + "step": 2207 + }, + { + "epoch": 7.971557562076749, + "grad_norm": 228.49502563476562, + "learning_rate": 1.8032667876588025e-05, + "loss": 34.5792, + "step": 2208 + }, + { + "epoch": 7.975169300225733, + "grad_norm": 285.4461364746094, + "learning_rate": 1.802722323049002e-05, + "loss": 37.9449, + "step": 2209 + }, + { + "epoch": 7.978781038374718, + "grad_norm": 186.83755493164062, + "learning_rate": 1.8021778584392016e-05, + "loss": 36.3295, + "step": 2210 + }, + { + "epoch": 7.978781038374718, + "eval_loss": 0.6212169528007507, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 2210 + }, + { + "epoch": 7.982392776523702, + "grad_norm": 210.31175231933594, + "learning_rate": 1.801633393829401e-05, + "loss": 37.0061, + "step": 2211 + }, + { + "epoch": 7.986004514672686, + "grad_norm": 251.96026611328125, + "learning_rate": 1.8010889292196006e-05, + "loss": 37.8831, + "step": 2212 + }, + { + "epoch": 7.98961625282167, + "grad_norm": 273.8665771484375, + "learning_rate": 1.8005444646098005e-05, + "loss": 38.8926, + "step": 2213 + }, + { + "epoch": 7.993227990970655, + "grad_norm": 207.25836181640625, + "learning_rate": 1.8e-05, + "loss": 30.0468, + "step": 2214 + }, + { + "epoch": 7.996839729119639, + "grad_norm": 200.5218048095703, + "learning_rate": 1.7994555353901996e-05, + "loss": 24.0549, + "step": 2215 + }, + { + "epoch": 8.0, + "grad_norm": 245.7149200439453, + "learning_rate": 1.798911070780399e-05, + "loss": 22.3158, + "step": 2216 + }, + { + "epoch": 8.003611738148985, + "grad_norm": 263.85546875, + "learning_rate": 1.798366606170599e-05, + "loss": 43.2342, + "step": 2217 + }, + { + "epoch": 8.007223476297968, + "grad_norm": 244.57205200195312, + "learning_rate": 1.797822141560799e-05, + "loss": 44.0931, + "step": 2218 + }, + { + "epoch": 8.010835214446953, + "grad_norm": 196.4144287109375, + "learning_rate": 1.7972776769509984e-05, + "loss": 42.1926, + "step": 2219 + }, + { + "epoch": 8.014446952595938, + "grad_norm": 282.3250427246094, + "learning_rate": 1.796733212341198e-05, + "loss": 41.4664, + "step": 2220 + }, + { + "epoch": 8.014446952595938, + "eval_loss": 0.6222901344299316, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 2220 + }, + { + "epoch": 8.01805869074492, + "grad_norm": 186.79281616210938, + "learning_rate": 1.7961887477313975e-05, + "loss": 42.2133, + "step": 2221 + }, + { + "epoch": 8.021670428893906, + "grad_norm": 220.3788299560547, + "learning_rate": 1.795644283121597e-05, + "loss": 42.0159, + "step": 2222 + }, + { + "epoch": 8.025282167042889, + "grad_norm": 262.37078857421875, + "learning_rate": 1.7950998185117966e-05, + "loss": 42.6055, + "step": 2223 + }, + { + "epoch": 8.028893905191874, + "grad_norm": 199.07078552246094, + "learning_rate": 1.7945553539019964e-05, + "loss": 43.3061, + "step": 2224 + }, + { + "epoch": 8.032505643340858, + "grad_norm": 256.6651306152344, + "learning_rate": 1.794010889292196e-05, + "loss": 42.4806, + "step": 2225 + }, + { + "epoch": 8.036117381489841, + "grad_norm": 281.17431640625, + "learning_rate": 1.793466424682396e-05, + "loss": 43.9823, + "step": 2226 + }, + { + "epoch": 8.039729119638826, + "grad_norm": 201.19837951660156, + "learning_rate": 1.7929219600725954e-05, + "loss": 41.8372, + "step": 2227 + }, + { + "epoch": 8.043340857787811, + "grad_norm": 195.1905059814453, + "learning_rate": 1.792377495462795e-05, + "loss": 38.8656, + "step": 2228 + }, + { + "epoch": 8.046952595936794, + "grad_norm": 215.02772521972656, + "learning_rate": 1.7918330308529948e-05, + "loss": 39.8965, + "step": 2229 + }, + { + "epoch": 8.050564334085779, + "grad_norm": 202.16322326660156, + "learning_rate": 1.7912885662431944e-05, + "loss": 41.0917, + "step": 2230 + }, + { + "epoch": 8.050564334085779, + "eval_loss": 0.6212881207466125, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2230 + }, + { + "epoch": 8.054176072234762, + "grad_norm": 218.90786743164062, + "learning_rate": 1.790744101633394e-05, + "loss": 38.5499, + "step": 2231 + }, + { + "epoch": 8.057787810383747, + "grad_norm": 179.57138061523438, + "learning_rate": 1.7901996370235934e-05, + "loss": 39.5915, + "step": 2232 + }, + { + "epoch": 8.061399548532732, + "grad_norm": 242.74801635742188, + "learning_rate": 1.789655172413793e-05, + "loss": 39.6094, + "step": 2233 + }, + { + "epoch": 8.065011286681715, + "grad_norm": 183.07102966308594, + "learning_rate": 1.7891107078039925e-05, + "loss": 40.6025, + "step": 2234 + }, + { + "epoch": 8.0686230248307, + "grad_norm": 192.85418701171875, + "learning_rate": 1.7885662431941924e-05, + "loss": 40.3013, + "step": 2235 + }, + { + "epoch": 8.072234762979685, + "grad_norm": 254.26353454589844, + "learning_rate": 1.7880217785843923e-05, + "loss": 39.1747, + "step": 2236 + }, + { + "epoch": 8.075846501128668, + "grad_norm": 230.7747802734375, + "learning_rate": 1.7874773139745918e-05, + "loss": 40.7569, + "step": 2237 + }, + { + "epoch": 8.079458239277653, + "grad_norm": 179.30528259277344, + "learning_rate": 1.7869328493647913e-05, + "loss": 40.0753, + "step": 2238 + }, + { + "epoch": 8.083069977426636, + "grad_norm": 203.48915100097656, + "learning_rate": 1.786388384754991e-05, + "loss": 41.4453, + "step": 2239 + }, + { + "epoch": 8.08668171557562, + "grad_norm": 274.8970947265625, + "learning_rate": 1.7858439201451908e-05, + "loss": 40.5818, + "step": 2240 + }, + { + "epoch": 8.08668171557562, + "eval_loss": 0.6184170842170715, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 2240 + }, + { + "epoch": 8.090293453724605, + "grad_norm": 237.2452392578125, + "learning_rate": 1.7852994555353903e-05, + "loss": 42.5794, + "step": 2241 + }, + { + "epoch": 8.093905191873588, + "grad_norm": 236.33766174316406, + "learning_rate": 1.7847549909255898e-05, + "loss": 41.89, + "step": 2242 + }, + { + "epoch": 8.097516930022573, + "grad_norm": 269.4791564941406, + "learning_rate": 1.7842105263157894e-05, + "loss": 41.7726, + "step": 2243 + }, + { + "epoch": 8.101128668171558, + "grad_norm": 192.28457641601562, + "learning_rate": 1.783666061705989e-05, + "loss": 40.1187, + "step": 2244 + }, + { + "epoch": 8.104740406320541, + "grad_norm": 201.5625457763672, + "learning_rate": 1.7831215970961888e-05, + "loss": 36.8004, + "step": 2245 + }, + { + "epoch": 8.108352144469526, + "grad_norm": 175.7625274658203, + "learning_rate": 1.7825771324863887e-05, + "loss": 33.8354, + "step": 2246 + }, + { + "epoch": 8.111963882618511, + "grad_norm": 195.6171112060547, + "learning_rate": 1.7820326678765882e-05, + "loss": 33.5176, + "step": 2247 + }, + { + "epoch": 8.115575620767494, + "grad_norm": 158.7554168701172, + "learning_rate": 1.7814882032667877e-05, + "loss": 34.2908, + "step": 2248 + }, + { + "epoch": 8.119187358916479, + "grad_norm": 192.78900146484375, + "learning_rate": 1.7809437386569873e-05, + "loss": 34.0861, + "step": 2249 + }, + { + "epoch": 8.122799097065462, + "grad_norm": 186.6603240966797, + "learning_rate": 1.7803992740471868e-05, + "loss": 35.5742, + "step": 2250 + }, + { + "epoch": 8.122799097065462, + "eval_loss": 0.6207499504089355, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2250 + }, + { + "epoch": 8.126410835214447, + "grad_norm": 264.3590087890625, + "learning_rate": 1.7798548094373867e-05, + "loss": 35.6709, + "step": 2251 + }, + { + "epoch": 8.130022573363432, + "grad_norm": 202.9478302001953, + "learning_rate": 1.7793103448275862e-05, + "loss": 36.4221, + "step": 2252 + }, + { + "epoch": 8.133634311512415, + "grad_norm": 229.260498046875, + "learning_rate": 1.7787658802177858e-05, + "loss": 36.0745, + "step": 2253 + }, + { + "epoch": 8.1372460496614, + "grad_norm": 222.37716674804688, + "learning_rate": 1.7782214156079856e-05, + "loss": 37.3266, + "step": 2254 + }, + { + "epoch": 8.140857787810384, + "grad_norm": 217.02272033691406, + "learning_rate": 1.7776769509981852e-05, + "loss": 37.2819, + "step": 2255 + }, + { + "epoch": 8.144469525959368, + "grad_norm": 247.61016845703125, + "learning_rate": 1.7771324863883847e-05, + "loss": 37.2683, + "step": 2256 + }, + { + "epoch": 8.148081264108352, + "grad_norm": 209.7449493408203, + "learning_rate": 1.7765880217785846e-05, + "loss": 36.7165, + "step": 2257 + }, + { + "epoch": 8.151693002257336, + "grad_norm": 217.30722045898438, + "learning_rate": 1.776043557168784e-05, + "loss": 37.0805, + "step": 2258 + }, + { + "epoch": 8.15530474040632, + "grad_norm": 181.5167236328125, + "learning_rate": 1.7754990925589837e-05, + "loss": 38.0326, + "step": 2259 + }, + { + "epoch": 8.158916478555305, + "grad_norm": 217.4818878173828, + "learning_rate": 1.7749546279491832e-05, + "loss": 37.1798, + "step": 2260 + }, + { + "epoch": 8.158916478555305, + "eval_loss": 0.6218119263648987, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 2260 + }, + { + "epoch": 8.162528216704288, + "grad_norm": 233.60733032226562, + "learning_rate": 1.7744101633393828e-05, + "loss": 36.6039, + "step": 2261 + }, + { + "epoch": 8.166139954853273, + "grad_norm": 184.5128631591797, + "learning_rate": 1.7738656987295826e-05, + "loss": 30.6188, + "step": 2262 + }, + { + "epoch": 8.169751693002258, + "grad_norm": 154.25791931152344, + "learning_rate": 1.7733212341197825e-05, + "loss": 24.0782, + "step": 2263 + }, + { + "epoch": 8.173363431151241, + "grad_norm": 179.92723083496094, + "learning_rate": 1.772776769509982e-05, + "loss": 23.7072, + "step": 2264 + }, + { + "epoch": 8.176975169300226, + "grad_norm": 170.87684631347656, + "learning_rate": 1.7722323049001816e-05, + "loss": 24.0008, + "step": 2265 + }, + { + "epoch": 8.18058690744921, + "grad_norm": 179.25233459472656, + "learning_rate": 1.771687840290381e-05, + "loss": 24.8393, + "step": 2266 + }, + { + "epoch": 8.184198645598194, + "grad_norm": 268.7836608886719, + "learning_rate": 1.7711433756805807e-05, + "loss": 44.0573, + "step": 2267 + }, + { + "epoch": 8.187810383747179, + "grad_norm": 249.12033081054688, + "learning_rate": 1.7705989110707805e-05, + "loss": 45.0218, + "step": 2268 + }, + { + "epoch": 8.191422121896162, + "grad_norm": 275.2551574707031, + "learning_rate": 1.77005444646098e-05, + "loss": 43.1954, + "step": 2269 + }, + { + "epoch": 8.195033860045147, + "grad_norm": 233.5360107421875, + "learning_rate": 1.7695099818511796e-05, + "loss": 43.0807, + "step": 2270 + }, + { + "epoch": 8.195033860045147, + "eval_loss": 0.6311450600624084, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 2270 + }, + { + "epoch": 8.198645598194132, + "grad_norm": 201.01617431640625, + "learning_rate": 1.768965517241379e-05, + "loss": 43.8161, + "step": 2271 + }, + { + "epoch": 8.202257336343115, + "grad_norm": 243.028564453125, + "learning_rate": 1.7684210526315787e-05, + "loss": 43.3388, + "step": 2272 + }, + { + "epoch": 8.2058690744921, + "grad_norm": 191.8246307373047, + "learning_rate": 1.767876588021779e-05, + "loss": 42.6949, + "step": 2273 + }, + { + "epoch": 8.209480812641084, + "grad_norm": 241.33609008789062, + "learning_rate": 1.7673321234119784e-05, + "loss": 43.3541, + "step": 2274 + }, + { + "epoch": 8.213092550790067, + "grad_norm": 247.99066162109375, + "learning_rate": 1.766787658802178e-05, + "loss": 44.4262, + "step": 2275 + }, + { + "epoch": 8.216704288939052, + "grad_norm": 223.35452270507812, + "learning_rate": 1.7662431941923775e-05, + "loss": 42.5696, + "step": 2276 + }, + { + "epoch": 8.220316027088035, + "grad_norm": 208.75209045410156, + "learning_rate": 1.765698729582577e-05, + "loss": 41.9236, + "step": 2277 + }, + { + "epoch": 8.22392776523702, + "grad_norm": 229.60305786132812, + "learning_rate": 1.7651542649727766e-05, + "loss": 39.962, + "step": 2278 + }, + { + "epoch": 8.227539503386005, + "grad_norm": 294.3867492675781, + "learning_rate": 1.7646098003629765e-05, + "loss": 39.0847, + "step": 2279 + }, + { + "epoch": 8.231151241534988, + "grad_norm": 201.49679565429688, + "learning_rate": 1.764065335753176e-05, + "loss": 39.1451, + "step": 2280 + }, + { + "epoch": 8.231151241534988, + "eval_loss": 0.6214079856872559, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 2280 + }, + { + "epoch": 8.234762979683973, + "grad_norm": 201.57894897460938, + "learning_rate": 1.7635208711433756e-05, + "loss": 39.4673, + "step": 2281 + }, + { + "epoch": 8.238374717832958, + "grad_norm": 201.0395965576172, + "learning_rate": 1.7629764065335754e-05, + "loss": 39.9832, + "step": 2282 + }, + { + "epoch": 8.241986455981941, + "grad_norm": 274.41168212890625, + "learning_rate": 1.762431941923775e-05, + "loss": 40.3885, + "step": 2283 + }, + { + "epoch": 8.245598194130926, + "grad_norm": 173.79977416992188, + "learning_rate": 1.761887477313975e-05, + "loss": 39.5292, + "step": 2284 + }, + { + "epoch": 8.249209932279909, + "grad_norm": 194.91806030273438, + "learning_rate": 1.7613430127041744e-05, + "loss": 40.3855, + "step": 2285 + }, + { + "epoch": 8.252821670428894, + "grad_norm": 216.47213745117188, + "learning_rate": 1.760798548094374e-05, + "loss": 40.937, + "step": 2286 + }, + { + "epoch": 8.256433408577879, + "grad_norm": 168.1825714111328, + "learning_rate": 1.7602540834845735e-05, + "loss": 41.2523, + "step": 2287 + }, + { + "epoch": 8.260045146726862, + "grad_norm": 187.51914978027344, + "learning_rate": 1.759709618874773e-05, + "loss": 40.6913, + "step": 2288 + }, + { + "epoch": 8.263656884875846, + "grad_norm": 183.99844360351562, + "learning_rate": 1.759165154264973e-05, + "loss": 42.5074, + "step": 2289 + }, + { + "epoch": 8.267268623024831, + "grad_norm": 201.23797607421875, + "learning_rate": 1.7586206896551724e-05, + "loss": 42.0519, + "step": 2290 + }, + { + "epoch": 8.267268623024831, + "eval_loss": 0.6184054017066956, + "eval_runtime": 3.1465, + "eval_samples_per_second": 56.889, + "eval_steps_per_second": 56.889, + "step": 2290 + }, + { + "epoch": 8.270880361173814, + "grad_norm": 219.0037384033203, + "learning_rate": 1.7580762250453723e-05, + "loss": 41.7059, + "step": 2291 + }, + { + "epoch": 8.2744920993228, + "grad_norm": 221.00173950195312, + "learning_rate": 1.7575317604355718e-05, + "loss": 40.9004, + "step": 2292 + }, + { + "epoch": 8.278103837471784, + "grad_norm": 180.00828552246094, + "learning_rate": 1.7569872958257714e-05, + "loss": 38.7865, + "step": 2293 + }, + { + "epoch": 8.281715575620767, + "grad_norm": 210.69302368164062, + "learning_rate": 1.756442831215971e-05, + "loss": 39.207, + "step": 2294 + }, + { + "epoch": 8.285327313769752, + "grad_norm": 196.8787078857422, + "learning_rate": 1.7558983666061708e-05, + "loss": 39.4472, + "step": 2295 + }, + { + "epoch": 8.288939051918735, + "grad_norm": 229.16331481933594, + "learning_rate": 1.7553539019963703e-05, + "loss": 36.5539, + "step": 2296 + }, + { + "epoch": 8.29255079006772, + "grad_norm": 180.67474365234375, + "learning_rate": 1.75480943738657e-05, + "loss": 34.3887, + "step": 2297 + }, + { + "epoch": 8.296162528216705, + "grad_norm": 234.046875, + "learning_rate": 1.7542649727767694e-05, + "loss": 34.158, + "step": 2298 + }, + { + "epoch": 8.299774266365688, + "grad_norm": 213.34255981445312, + "learning_rate": 1.753720508166969e-05, + "loss": 34.7655, + "step": 2299 + }, + { + "epoch": 8.303386004514673, + "grad_norm": 205.6382598876953, + "learning_rate": 1.753176043557169e-05, + "loss": 34.4223, + "step": 2300 + }, + { + "epoch": 8.303386004514673, + "eval_loss": 0.6200549006462097, + "eval_runtime": 3.1447, + "eval_samples_per_second": 56.921, + "eval_steps_per_second": 56.921, + "step": 2300 + }, + { + "epoch": 8.306997742663658, + "grad_norm": 189.79238891601562, + "learning_rate": 1.7526315789473687e-05, + "loss": 35.3846, + "step": 2301 + }, + { + "epoch": 8.31060948081264, + "grad_norm": 202.27859497070312, + "learning_rate": 1.7520871143375682e-05, + "loss": 34.9006, + "step": 2302 + }, + { + "epoch": 8.314221218961626, + "grad_norm": 217.62327575683594, + "learning_rate": 1.7515426497277678e-05, + "loss": 36.3079, + "step": 2303 + }, + { + "epoch": 8.317832957110609, + "grad_norm": 212.82862854003906, + "learning_rate": 1.7509981851179673e-05, + "loss": 35.8598, + "step": 2304 + }, + { + "epoch": 8.321444695259594, + "grad_norm": 229.778564453125, + "learning_rate": 1.750453720508167e-05, + "loss": 37.0853, + "step": 2305 + }, + { + "epoch": 8.325056433408578, + "grad_norm": 219.99844360351562, + "learning_rate": 1.7499092558983667e-05, + "loss": 38.01, + "step": 2306 + }, + { + "epoch": 8.328668171557561, + "grad_norm": 202.63035583496094, + "learning_rate": 1.7493647912885663e-05, + "loss": 36.4756, + "step": 2307 + }, + { + "epoch": 8.332279909706546, + "grad_norm": 188.44094848632812, + "learning_rate": 1.7488203266787658e-05, + "loss": 37.0509, + "step": 2308 + }, + { + "epoch": 8.335891647855531, + "grad_norm": 187.8760223388672, + "learning_rate": 1.7482758620689657e-05, + "loss": 38.0019, + "step": 2309 + }, + { + "epoch": 8.339503386004514, + "grad_norm": 239.35833740234375, + "learning_rate": 1.7477313974591652e-05, + "loss": 38.2255, + "step": 2310 + }, + { + "epoch": 8.339503386004514, + "eval_loss": 0.6221747994422913, + "eval_runtime": 3.148, + "eval_samples_per_second": 56.862, + "eval_steps_per_second": 56.862, + "step": 2310 + }, + { + "epoch": 8.343115124153499, + "grad_norm": 236.3567657470703, + "learning_rate": 1.747186932849365e-05, + "loss": 37.3598, + "step": 2311 + }, + { + "epoch": 8.346726862302482, + "grad_norm": 188.16151428222656, + "learning_rate": 1.7466424682395646e-05, + "loss": 27.1993, + "step": 2312 + }, + { + "epoch": 8.350338600451467, + "grad_norm": 216.58778381347656, + "learning_rate": 1.746098003629764e-05, + "loss": 23.7024, + "step": 2313 + }, + { + "epoch": 8.353950338600452, + "grad_norm": 221.03111267089844, + "learning_rate": 1.7455535390199637e-05, + "loss": 24.2856, + "step": 2314 + }, + { + "epoch": 8.357562076749435, + "grad_norm": 180.36221313476562, + "learning_rate": 1.7450090744101632e-05, + "loss": 23.7624, + "step": 2315 + }, + { + "epoch": 8.36117381489842, + "grad_norm": 198.77438354492188, + "learning_rate": 1.7444646098003628e-05, + "loss": 25.8628, + "step": 2316 + }, + { + "epoch": 8.364785553047405, + "grad_norm": 250.81321716308594, + "learning_rate": 1.7439201451905627e-05, + "loss": 43.4097, + "step": 2317 + }, + { + "epoch": 8.368397291196388, + "grad_norm": 246.19544982910156, + "learning_rate": 1.7433756805807622e-05, + "loss": 44.7141, + "step": 2318 + }, + { + "epoch": 8.372009029345373, + "grad_norm": 245.04241943359375, + "learning_rate": 1.742831215970962e-05, + "loss": 44.4511, + "step": 2319 + }, + { + "epoch": 8.375620767494357, + "grad_norm": 224.05331420898438, + "learning_rate": 1.7422867513611616e-05, + "loss": 43.5971, + "step": 2320 + }, + { + "epoch": 8.375620767494357, + "eval_loss": 0.6324251294136047, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 2320 + }, + { + "epoch": 8.37923250564334, + "grad_norm": 222.3795623779297, + "learning_rate": 1.741742286751361e-05, + "loss": 42.9007, + "step": 2321 + }, + { + "epoch": 8.382844243792325, + "grad_norm": 210.0133514404297, + "learning_rate": 1.741197822141561e-05, + "loss": 42.8733, + "step": 2322 + }, + { + "epoch": 8.386455981941308, + "grad_norm": 222.01031494140625, + "learning_rate": 1.7406533575317606e-05, + "loss": 42.9875, + "step": 2323 + }, + { + "epoch": 8.390067720090293, + "grad_norm": 187.30101013183594, + "learning_rate": 1.74010889292196e-05, + "loss": 42.4873, + "step": 2324 + }, + { + "epoch": 8.393679458239278, + "grad_norm": 188.22048950195312, + "learning_rate": 1.7395644283121596e-05, + "loss": 42.2066, + "step": 2325 + }, + { + "epoch": 8.397291196388261, + "grad_norm": 228.75363159179688, + "learning_rate": 1.7390199637023592e-05, + "loss": 42.7604, + "step": 2326 + }, + { + "epoch": 8.400902934537246, + "grad_norm": 196.8817901611328, + "learning_rate": 1.7384754990925587e-05, + "loss": 42.445, + "step": 2327 + }, + { + "epoch": 8.404514672686231, + "grad_norm": 205.3610382080078, + "learning_rate": 1.737931034482759e-05, + "loss": 39.8408, + "step": 2328 + }, + { + "epoch": 8.408126410835214, + "grad_norm": 259.0702819824219, + "learning_rate": 1.7373865698729585e-05, + "loss": 40.847, + "step": 2329 + }, + { + "epoch": 8.411738148984199, + "grad_norm": 216.12017822265625, + "learning_rate": 1.736842105263158e-05, + "loss": 40.4648, + "step": 2330 + }, + { + "epoch": 8.411738148984199, + "eval_loss": 0.6252871155738831, + "eval_runtime": 3.1421, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2330 + }, + { + "epoch": 8.415349887133182, + "grad_norm": 330.9464111328125, + "learning_rate": 1.7362976406533575e-05, + "loss": 39.7682, + "step": 2331 + }, + { + "epoch": 8.418961625282167, + "grad_norm": 237.19505310058594, + "learning_rate": 1.735753176043557e-05, + "loss": 38.8824, + "step": 2332 + }, + { + "epoch": 8.422573363431152, + "grad_norm": 247.22259521484375, + "learning_rate": 1.735208711433757e-05, + "loss": 40.1187, + "step": 2333 + }, + { + "epoch": 8.426185101580135, + "grad_norm": 267.739990234375, + "learning_rate": 1.7346642468239565e-05, + "loss": 40.4589, + "step": 2334 + }, + { + "epoch": 8.42979683972912, + "grad_norm": 308.715576171875, + "learning_rate": 1.734119782214156e-05, + "loss": 41.5481, + "step": 2335 + }, + { + "epoch": 8.433408577878104, + "grad_norm": 350.8972473144531, + "learning_rate": 1.7335753176043556e-05, + "loss": 41.6628, + "step": 2336 + }, + { + "epoch": 8.437020316027088, + "grad_norm": 245.9825897216797, + "learning_rate": 1.7330308529945555e-05, + "loss": 40.3527, + "step": 2337 + }, + { + "epoch": 8.440632054176072, + "grad_norm": 253.94488525390625, + "learning_rate": 1.732486388384755e-05, + "loss": 39.6388, + "step": 2338 + }, + { + "epoch": 8.444243792325057, + "grad_norm": 226.24179077148438, + "learning_rate": 1.731941923774955e-05, + "loss": 40.5561, + "step": 2339 + }, + { + "epoch": 8.44785553047404, + "grad_norm": 188.66746520996094, + "learning_rate": 1.7313974591651544e-05, + "loss": 41.8422, + "step": 2340 + }, + { + "epoch": 8.44785553047404, + "eval_loss": 0.6197592616081238, + "eval_runtime": 3.1522, + "eval_samples_per_second": 56.786, + "eval_steps_per_second": 56.786, + "step": 2340 + }, + { + "epoch": 8.451467268623025, + "grad_norm": 227.01014709472656, + "learning_rate": 1.730852994555354e-05, + "loss": 41.4184, + "step": 2341 + }, + { + "epoch": 8.455079006772008, + "grad_norm": 187.11643981933594, + "learning_rate": 1.7303085299455535e-05, + "loss": 40.796, + "step": 2342 + }, + { + "epoch": 8.458690744920993, + "grad_norm": 243.1756134033203, + "learning_rate": 1.729764065335753e-05, + "loss": 41.7926, + "step": 2343 + }, + { + "epoch": 8.462302483069978, + "grad_norm": 226.15187072753906, + "learning_rate": 1.729219600725953e-05, + "loss": 41.588, + "step": 2344 + }, + { + "epoch": 8.465914221218961, + "grad_norm": 218.49935913085938, + "learning_rate": 1.7286751361161524e-05, + "loss": 39.6935, + "step": 2345 + }, + { + "epoch": 8.469525959367946, + "grad_norm": 232.4805145263672, + "learning_rate": 1.7281306715063523e-05, + "loss": 37.0718, + "step": 2346 + }, + { + "epoch": 8.47313769751693, + "grad_norm": 201.1748046875, + "learning_rate": 1.727586206896552e-05, + "loss": 33.9633, + "step": 2347 + }, + { + "epoch": 8.476749435665914, + "grad_norm": 208.79733276367188, + "learning_rate": 1.7270417422867514e-05, + "loss": 33.4553, + "step": 2348 + }, + { + "epoch": 8.480361173814899, + "grad_norm": 235.91151428222656, + "learning_rate": 1.726497277676951e-05, + "loss": 33.6144, + "step": 2349 + }, + { + "epoch": 8.483972911963882, + "grad_norm": 206.28811645507812, + "learning_rate": 1.7259528130671508e-05, + "loss": 35.3678, + "step": 2350 + }, + { + "epoch": 8.483972911963882, + "eval_loss": 0.6203061938285828, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 2350 + }, + { + "epoch": 8.487584650112867, + "grad_norm": 305.2204284667969, + "learning_rate": 1.7254083484573503e-05, + "loss": 35.9175, + "step": 2351 + }, + { + "epoch": 8.491196388261852, + "grad_norm": 227.1592254638672, + "learning_rate": 1.72486388384755e-05, + "loss": 35.5001, + "step": 2352 + }, + { + "epoch": 8.494808126410835, + "grad_norm": 194.739501953125, + "learning_rate": 1.7243194192377494e-05, + "loss": 35.0015, + "step": 2353 + }, + { + "epoch": 8.49841986455982, + "grad_norm": 233.8467254638672, + "learning_rate": 1.723774954627949e-05, + "loss": 36.8257, + "step": 2354 + }, + { + "epoch": 8.502031602708804, + "grad_norm": 258.8914489746094, + "learning_rate": 1.7232304900181492e-05, + "loss": 36.1246, + "step": 2355 + }, + { + "epoch": 8.505643340857787, + "grad_norm": 194.8585968017578, + "learning_rate": 1.7226860254083487e-05, + "loss": 36.1245, + "step": 2356 + }, + { + "epoch": 8.509255079006772, + "grad_norm": 191.2276153564453, + "learning_rate": 1.7221415607985483e-05, + "loss": 37.0608, + "step": 2357 + }, + { + "epoch": 8.512866817155757, + "grad_norm": 197.9025115966797, + "learning_rate": 1.7215970961887478e-05, + "loss": 37.0779, + "step": 2358 + }, + { + "epoch": 8.51647855530474, + "grad_norm": 207.01016235351562, + "learning_rate": 1.7210526315789473e-05, + "loss": 37.8432, + "step": 2359 + }, + { + "epoch": 8.520090293453725, + "grad_norm": 222.20201110839844, + "learning_rate": 1.720508166969147e-05, + "loss": 36.6983, + "step": 2360 + }, + { + "epoch": 8.520090293453725, + "eval_loss": 0.6240220665931702, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 2360 + }, + { + "epoch": 8.523702031602708, + "grad_norm": 200.19273376464844, + "learning_rate": 1.7199637023593467e-05, + "loss": 38.0613, + "step": 2361 + }, + { + "epoch": 8.527313769751693, + "grad_norm": 205.36758422851562, + "learning_rate": 1.7194192377495463e-05, + "loss": 29.6395, + "step": 2362 + }, + { + "epoch": 8.530925507900678, + "grad_norm": 206.53396606445312, + "learning_rate": 1.7188747731397458e-05, + "loss": 23.6478, + "step": 2363 + }, + { + "epoch": 8.534537246049661, + "grad_norm": 219.47044372558594, + "learning_rate": 1.7183303085299454e-05, + "loss": 22.8522, + "step": 2364 + }, + { + "epoch": 8.538148984198646, + "grad_norm": 178.48008728027344, + "learning_rate": 1.7177858439201452e-05, + "loss": 24.1411, + "step": 2365 + }, + { + "epoch": 8.54176072234763, + "grad_norm": 222.63731384277344, + "learning_rate": 1.717241379310345e-05, + "loss": 26.2818, + "step": 2366 + }, + { + "epoch": 8.545372460496614, + "grad_norm": 216.6333465576172, + "learning_rate": 1.7166969147005447e-05, + "loss": 42.5599, + "step": 2367 + }, + { + "epoch": 8.548984198645599, + "grad_norm": 241.42532348632812, + "learning_rate": 1.7161524500907442e-05, + "loss": 44.0016, + "step": 2368 + }, + { + "epoch": 8.552595936794582, + "grad_norm": 227.95193481445312, + "learning_rate": 1.7156079854809437e-05, + "loss": 44.1662, + "step": 2369 + }, + { + "epoch": 8.556207674943566, + "grad_norm": 204.9208526611328, + "learning_rate": 1.7150635208711433e-05, + "loss": 41.2255, + "step": 2370 + }, + { + "epoch": 8.556207674943566, + "eval_loss": 0.6293933987617493, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 2370 + }, + { + "epoch": 8.559819413092551, + "grad_norm": 168.1370849609375, + "learning_rate": 1.7145190562613428e-05, + "loss": 42.8374, + "step": 2371 + }, + { + "epoch": 8.563431151241534, + "grad_norm": 209.16641235351562, + "learning_rate": 1.7139745916515427e-05, + "loss": 42.4378, + "step": 2372 + }, + { + "epoch": 8.56704288939052, + "grad_norm": 235.36373901367188, + "learning_rate": 1.7134301270417422e-05, + "loss": 43.3213, + "step": 2373 + }, + { + "epoch": 8.570654627539504, + "grad_norm": 198.8206329345703, + "learning_rate": 1.712885662431942e-05, + "loss": 43.5621, + "step": 2374 + }, + { + "epoch": 8.574266365688487, + "grad_norm": 191.1640167236328, + "learning_rate": 1.7123411978221416e-05, + "loss": 41.8729, + "step": 2375 + }, + { + "epoch": 8.577878103837472, + "grad_norm": 281.6352233886719, + "learning_rate": 1.7117967332123412e-05, + "loss": 42.8306, + "step": 2376 + }, + { + "epoch": 8.581489841986457, + "grad_norm": 191.68939208984375, + "learning_rate": 1.711252268602541e-05, + "loss": 41.3603, + "step": 2377 + }, + { + "epoch": 8.58510158013544, + "grad_norm": 175.3041229248047, + "learning_rate": 1.7107078039927406e-05, + "loss": 38.7076, + "step": 2378 + }, + { + "epoch": 8.588713318284425, + "grad_norm": 186.31202697753906, + "learning_rate": 1.71016333938294e-05, + "loss": 38.832, + "step": 2379 + }, + { + "epoch": 8.592325056433408, + "grad_norm": 192.0680389404297, + "learning_rate": 1.7096188747731397e-05, + "loss": 40.6542, + "step": 2380 + }, + { + "epoch": 8.592325056433408, + "eval_loss": 0.6245992183685303, + "eval_runtime": 3.1487, + "eval_samples_per_second": 56.848, + "eval_steps_per_second": 56.848, + "step": 2380 + }, + { + "epoch": 8.595936794582393, + "grad_norm": 284.3516540527344, + "learning_rate": 1.7090744101633392e-05, + "loss": 40.3145, + "step": 2381 + }, + { + "epoch": 8.599548532731378, + "grad_norm": 210.2421875, + "learning_rate": 1.708529945553539e-05, + "loss": 39.9109, + "step": 2382 + }, + { + "epoch": 8.60316027088036, + "grad_norm": 202.3438720703125, + "learning_rate": 1.707985480943739e-05, + "loss": 39.0686, + "step": 2383 + }, + { + "epoch": 8.606772009029346, + "grad_norm": 189.5508270263672, + "learning_rate": 1.7074410163339385e-05, + "loss": 40.6673, + "step": 2384 + }, + { + "epoch": 8.610383747178329, + "grad_norm": 199.3516387939453, + "learning_rate": 1.706896551724138e-05, + "loss": 40.5357, + "step": 2385 + }, + { + "epoch": 8.613995485327314, + "grad_norm": 183.11309814453125, + "learning_rate": 1.7063520871143376e-05, + "loss": 40.7691, + "step": 2386 + }, + { + "epoch": 8.617607223476298, + "grad_norm": 347.104248046875, + "learning_rate": 1.705807622504537e-05, + "loss": 40.6822, + "step": 2387 + }, + { + "epoch": 8.621218961625281, + "grad_norm": 341.0453796386719, + "learning_rate": 1.705263157894737e-05, + "loss": 40.9791, + "step": 2388 + }, + { + "epoch": 8.624830699774266, + "grad_norm": 335.33221435546875, + "learning_rate": 1.7047186932849365e-05, + "loss": 41.0977, + "step": 2389 + }, + { + "epoch": 8.628442437923251, + "grad_norm": 209.75198364257812, + "learning_rate": 1.704174228675136e-05, + "loss": 41.3332, + "step": 2390 + }, + { + "epoch": 8.628442437923251, + "eval_loss": 0.6176490783691406, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2390 + }, + { + "epoch": 8.632054176072234, + "grad_norm": 221.6715545654297, + "learning_rate": 1.7036297640653356e-05, + "loss": 41.7456, + "step": 2391 + }, + { + "epoch": 8.635665914221219, + "grad_norm": 255.7875213623047, + "learning_rate": 1.7030852994555355e-05, + "loss": 41.7063, + "step": 2392 + }, + { + "epoch": 8.639277652370204, + "grad_norm": 206.66221618652344, + "learning_rate": 1.7025408348457354e-05, + "loss": 41.941, + "step": 2393 + }, + { + "epoch": 8.642889390519187, + "grad_norm": 381.9871826171875, + "learning_rate": 1.701996370235935e-05, + "loss": 42.8615, + "step": 2394 + }, + { + "epoch": 8.646501128668172, + "grad_norm": 303.8249816894531, + "learning_rate": 1.7014519056261344e-05, + "loss": 37.8472, + "step": 2395 + }, + { + "epoch": 8.650112866817155, + "grad_norm": 201.2444610595703, + "learning_rate": 1.700907441016334e-05, + "loss": 35.4641, + "step": 2396 + }, + { + "epoch": 8.65372460496614, + "grad_norm": 242.34298706054688, + "learning_rate": 1.7003629764065335e-05, + "loss": 33.3414, + "step": 2397 + }, + { + "epoch": 8.657336343115125, + "grad_norm": 214.45384216308594, + "learning_rate": 1.699818511796733e-05, + "loss": 33.7771, + "step": 2398 + }, + { + "epoch": 8.660948081264108, + "grad_norm": 276.4810485839844, + "learning_rate": 1.699274047186933e-05, + "loss": 35.4289, + "step": 2399 + }, + { + "epoch": 8.664559819413093, + "grad_norm": 199.68626403808594, + "learning_rate": 1.6987295825771325e-05, + "loss": 34.4205, + "step": 2400 + }, + { + "epoch": 8.664559819413093, + "eval_loss": 0.6179484128952026, + "eval_runtime": 3.1618, + "eval_samples_per_second": 56.614, + "eval_steps_per_second": 56.614, + "step": 2400 + }, + { + "epoch": 8.668171557562077, + "grad_norm": 239.19200134277344, + "learning_rate": 1.698185117967332e-05, + "loss": 34.3428, + "step": 2401 + }, + { + "epoch": 8.67178329571106, + "grad_norm": 341.44927978515625, + "learning_rate": 1.697640653357532e-05, + "loss": 37.6011, + "step": 2402 + }, + { + "epoch": 8.675395033860045, + "grad_norm": 260.5967102050781, + "learning_rate": 1.6970961887477314e-05, + "loss": 34.9222, + "step": 2403 + }, + { + "epoch": 8.679006772009028, + "grad_norm": 217.9357147216797, + "learning_rate": 1.6965517241379313e-05, + "loss": 36.6177, + "step": 2404 + }, + { + "epoch": 8.682618510158013, + "grad_norm": 355.21917724609375, + "learning_rate": 1.696007259528131e-05, + "loss": 36.3072, + "step": 2405 + }, + { + "epoch": 8.686230248306998, + "grad_norm": 279.37200927734375, + "learning_rate": 1.6954627949183304e-05, + "loss": 36.7026, + "step": 2406 + }, + { + "epoch": 8.689841986455981, + "grad_norm": 344.9017028808594, + "learning_rate": 1.69491833030853e-05, + "loss": 37.5009, + "step": 2407 + }, + { + "epoch": 8.693453724604966, + "grad_norm": 225.28668212890625, + "learning_rate": 1.6943738656987295e-05, + "loss": 36.0914, + "step": 2408 + }, + { + "epoch": 8.697065462753951, + "grad_norm": 233.16372680664062, + "learning_rate": 1.693829401088929e-05, + "loss": 38.0917, + "step": 2409 + }, + { + "epoch": 8.700677200902934, + "grad_norm": 220.2307891845703, + "learning_rate": 1.693284936479129e-05, + "loss": 37.4493, + "step": 2410 + }, + { + "epoch": 8.700677200902934, + "eval_loss": 0.6225734949111938, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2410 + }, + { + "epoch": 8.704288939051919, + "grad_norm": 298.2883605957031, + "learning_rate": 1.6927404718693287e-05, + "loss": 37.6527, + "step": 2411 + }, + { + "epoch": 8.707900677200904, + "grad_norm": 329.1615295410156, + "learning_rate": 1.6921960072595283e-05, + "loss": 30.9627, + "step": 2412 + }, + { + "epoch": 8.711512415349887, + "grad_norm": 192.55380249023438, + "learning_rate": 1.6916515426497278e-05, + "loss": 24.2028, + "step": 2413 + }, + { + "epoch": 8.715124153498872, + "grad_norm": 162.13583374023438, + "learning_rate": 1.6911070780399274e-05, + "loss": 23.3005, + "step": 2414 + }, + { + "epoch": 8.718735891647855, + "grad_norm": 152.95108032226562, + "learning_rate": 1.6905626134301272e-05, + "loss": 24.335, + "step": 2415 + }, + { + "epoch": 8.72234762979684, + "grad_norm": 183.4193572998047, + "learning_rate": 1.6900181488203268e-05, + "loss": 24.9279, + "step": 2416 + }, + { + "epoch": 8.725959367945824, + "grad_norm": 232.93650817871094, + "learning_rate": 1.6894736842105263e-05, + "loss": 43.4574, + "step": 2417 + }, + { + "epoch": 8.729571106094808, + "grad_norm": 226.85890197753906, + "learning_rate": 1.688929219600726e-05, + "loss": 44.4136, + "step": 2418 + }, + { + "epoch": 8.733182844243792, + "grad_norm": 232.16064453125, + "learning_rate": 1.6883847549909254e-05, + "loss": 42.8183, + "step": 2419 + }, + { + "epoch": 8.736794582392777, + "grad_norm": 243.5811767578125, + "learning_rate": 1.6878402903811253e-05, + "loss": 43.3031, + "step": 2420 + }, + { + "epoch": 8.736794582392777, + "eval_loss": 0.6284167170524597, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2420 + }, + { + "epoch": 8.74040632054176, + "grad_norm": 194.7115020751953, + "learning_rate": 1.687295825771325e-05, + "loss": 42.1276, + "step": 2421 + }, + { + "epoch": 8.744018058690745, + "grad_norm": 250.81983947753906, + "learning_rate": 1.6867513611615247e-05, + "loss": 42.5535, + "step": 2422 + }, + { + "epoch": 8.747629796839728, + "grad_norm": 205.1988983154297, + "learning_rate": 1.6862068965517242e-05, + "loss": 42.7745, + "step": 2423 + }, + { + "epoch": 8.751241534988713, + "grad_norm": 159.68243408203125, + "learning_rate": 1.6856624319419238e-05, + "loss": 43.6562, + "step": 2424 + }, + { + "epoch": 8.754853273137698, + "grad_norm": 164.31361389160156, + "learning_rate": 1.6851179673321233e-05, + "loss": 43.4602, + "step": 2425 + }, + { + "epoch": 8.758465011286681, + "grad_norm": 213.9793243408203, + "learning_rate": 1.6845735027223232e-05, + "loss": 42.1559, + "step": 2426 + }, + { + "epoch": 8.762076749435666, + "grad_norm": 205.79107666015625, + "learning_rate": 1.6840290381125227e-05, + "loss": 41.5687, + "step": 2427 + }, + { + "epoch": 8.76568848758465, + "grad_norm": 235.80348205566406, + "learning_rate": 1.6834845735027223e-05, + "loss": 41.0748, + "step": 2428 + }, + { + "epoch": 8.769300225733634, + "grad_norm": 203.84884643554688, + "learning_rate": 1.682940108892922e-05, + "loss": 39.3348, + "step": 2429 + }, + { + "epoch": 8.772911963882619, + "grad_norm": 271.2411804199219, + "learning_rate": 1.6823956442831217e-05, + "loss": 39.357, + "step": 2430 + }, + { + "epoch": 8.772911963882619, + "eval_loss": 0.6211046576499939, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 2430 + }, + { + "epoch": 8.776523702031604, + "grad_norm": 222.4960174560547, + "learning_rate": 1.6818511796733212e-05, + "loss": 39.2198, + "step": 2431 + }, + { + "epoch": 8.780135440180587, + "grad_norm": 325.9942932128906, + "learning_rate": 1.681306715063521e-05, + "loss": 40.572, + "step": 2432 + }, + { + "epoch": 8.783747178329572, + "grad_norm": 195.2740936279297, + "learning_rate": 1.6807622504537206e-05, + "loss": 39.2727, + "step": 2433 + }, + { + "epoch": 8.787358916478555, + "grad_norm": 196.16964721679688, + "learning_rate": 1.68021778584392e-05, + "loss": 40.6503, + "step": 2434 + }, + { + "epoch": 8.79097065462754, + "grad_norm": 183.2659454345703, + "learning_rate": 1.6796733212341197e-05, + "loss": 41.2074, + "step": 2435 + }, + { + "epoch": 8.794582392776524, + "grad_norm": 293.393798828125, + "learning_rate": 1.6791288566243192e-05, + "loss": 40.2778, + "step": 2436 + }, + { + "epoch": 8.798194130925507, + "grad_norm": 232.8402099609375, + "learning_rate": 1.678584392014519e-05, + "loss": 40.0305, + "step": 2437 + }, + { + "epoch": 8.801805869074492, + "grad_norm": 269.957275390625, + "learning_rate": 1.678039927404719e-05, + "loss": 40.4216, + "step": 2438 + }, + { + "epoch": 8.805417607223477, + "grad_norm": 175.6732635498047, + "learning_rate": 1.6774954627949185e-05, + "loss": 40.7998, + "step": 2439 + }, + { + "epoch": 8.80902934537246, + "grad_norm": 209.0604248046875, + "learning_rate": 1.676950998185118e-05, + "loss": 41.1176, + "step": 2440 + }, + { + "epoch": 8.80902934537246, + "eval_loss": 0.6211614012718201, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 2440 + }, + { + "epoch": 8.812641083521445, + "grad_norm": 229.91171264648438, + "learning_rate": 1.6764065335753176e-05, + "loss": 41.37, + "step": 2441 + }, + { + "epoch": 8.816252821670428, + "grad_norm": 192.99610900878906, + "learning_rate": 1.675862068965517e-05, + "loss": 41.8377, + "step": 2442 + }, + { + "epoch": 8.819864559819413, + "grad_norm": 239.290771484375, + "learning_rate": 1.675317604355717e-05, + "loss": 42.3038, + "step": 2443 + }, + { + "epoch": 8.823476297968398, + "grad_norm": 203.52330017089844, + "learning_rate": 1.6747731397459166e-05, + "loss": 41.3334, + "step": 2444 + }, + { + "epoch": 8.827088036117381, + "grad_norm": 247.99099731445312, + "learning_rate": 1.674228675136116e-05, + "loss": 37.7455, + "step": 2445 + }, + { + "epoch": 8.830699774266366, + "grad_norm": 205.9770965576172, + "learning_rate": 1.6736842105263156e-05, + "loss": 34.6828, + "step": 2446 + }, + { + "epoch": 8.83431151241535, + "grad_norm": 215.47024536132812, + "learning_rate": 1.6731397459165152e-05, + "loss": 34.927, + "step": 2447 + }, + { + "epoch": 8.837923250564334, + "grad_norm": 254.14010620117188, + "learning_rate": 1.6725952813067154e-05, + "loss": 35.3194, + "step": 2448 + }, + { + "epoch": 8.841534988713319, + "grad_norm": 221.18174743652344, + "learning_rate": 1.672050816696915e-05, + "loss": 34.9577, + "step": 2449 + }, + { + "epoch": 8.845146726862303, + "grad_norm": 191.1651611328125, + "learning_rate": 1.6715063520871145e-05, + "loss": 33.7244, + "step": 2450 + }, + { + "epoch": 8.845146726862303, + "eval_loss": 0.6216589212417603, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2450 + }, + { + "epoch": 8.848758465011286, + "grad_norm": 228.3920135498047, + "learning_rate": 1.670961887477314e-05, + "loss": 34.9689, + "step": 2451 + }, + { + "epoch": 8.852370203160271, + "grad_norm": 227.6689910888672, + "learning_rate": 1.6704174228675135e-05, + "loss": 36.0718, + "step": 2452 + }, + { + "epoch": 8.855981941309254, + "grad_norm": 182.38978576660156, + "learning_rate": 1.669872958257713e-05, + "loss": 37.1143, + "step": 2453 + }, + { + "epoch": 8.85959367945824, + "grad_norm": 223.66966247558594, + "learning_rate": 1.669328493647913e-05, + "loss": 34.4468, + "step": 2454 + }, + { + "epoch": 8.863205417607224, + "grad_norm": 260.3930358886719, + "learning_rate": 1.6687840290381125e-05, + "loss": 36.7305, + "step": 2455 + }, + { + "epoch": 8.866817155756207, + "grad_norm": 218.60385131835938, + "learning_rate": 1.668239564428312e-05, + "loss": 36.1995, + "step": 2456 + }, + { + "epoch": 8.870428893905192, + "grad_norm": 227.4342041015625, + "learning_rate": 1.667695099818512e-05, + "loss": 35.9138, + "step": 2457 + }, + { + "epoch": 8.874040632054175, + "grad_norm": 208.42196655273438, + "learning_rate": 1.6671506352087115e-05, + "loss": 37.2621, + "step": 2458 + }, + { + "epoch": 8.87765237020316, + "grad_norm": 214.9486541748047, + "learning_rate": 1.6666061705989113e-05, + "loss": 38.5176, + "step": 2459 + }, + { + "epoch": 8.881264108352145, + "grad_norm": 226.6992645263672, + "learning_rate": 1.666061705989111e-05, + "loss": 38.3917, + "step": 2460 + }, + { + "epoch": 8.881264108352145, + "eval_loss": 0.6277003884315491, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 2460 + }, + { + "epoch": 8.884875846501128, + "grad_norm": 282.3875732421875, + "learning_rate": 1.6655172413793104e-05, + "loss": 39.1439, + "step": 2461 + }, + { + "epoch": 8.888487584650113, + "grad_norm": 240.29022216796875, + "learning_rate": 1.66497277676951e-05, + "loss": 33.7717, + "step": 2462 + }, + { + "epoch": 8.892099322799098, + "grad_norm": 231.84727478027344, + "learning_rate": 1.6644283121597095e-05, + "loss": 24.1146, + "step": 2463 + }, + { + "epoch": 8.89571106094808, + "grad_norm": 215.5159149169922, + "learning_rate": 1.663883847549909e-05, + "loss": 24.0165, + "step": 2464 + }, + { + "epoch": 8.899322799097066, + "grad_norm": 278.42950439453125, + "learning_rate": 1.663339382940109e-05, + "loss": 24.2048, + "step": 2465 + }, + { + "epoch": 8.90293453724605, + "grad_norm": 187.03341674804688, + "learning_rate": 1.6627949183303088e-05, + "loss": 24.7332, + "step": 2466 + }, + { + "epoch": 8.906546275395034, + "grad_norm": 261.2938232421875, + "learning_rate": 1.6622504537205083e-05, + "loss": 42.6764, + "step": 2467 + }, + { + "epoch": 8.910158013544018, + "grad_norm": 234.00880432128906, + "learning_rate": 1.661705989110708e-05, + "loss": 42.9894, + "step": 2468 + }, + { + "epoch": 8.913769751693001, + "grad_norm": 263.2890319824219, + "learning_rate": 1.6611615245009074e-05, + "loss": 43.3274, + "step": 2469 + }, + { + "epoch": 8.917381489841986, + "grad_norm": 286.3260192871094, + "learning_rate": 1.6606170598911073e-05, + "loss": 44.3862, + "step": 2470 + }, + { + "epoch": 8.917381489841986, + "eval_loss": 0.6278789043426514, + "eval_runtime": 3.1423, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2470 + }, + { + "epoch": 8.920993227990971, + "grad_norm": 273.5133972167969, + "learning_rate": 1.6600725952813068e-05, + "loss": 43.4195, + "step": 2471 + }, + { + "epoch": 8.924604966139954, + "grad_norm": 246.2245330810547, + "learning_rate": 1.6595281306715063e-05, + "loss": 43.153, + "step": 2472 + }, + { + "epoch": 8.928216704288939, + "grad_norm": 261.3001403808594, + "learning_rate": 1.658983666061706e-05, + "loss": 41.1276, + "step": 2473 + }, + { + "epoch": 8.931828442437924, + "grad_norm": 263.7626037597656, + "learning_rate": 1.6584392014519054e-05, + "loss": 40.5055, + "step": 2474 + }, + { + "epoch": 8.935440180586907, + "grad_norm": 233.80442810058594, + "learning_rate": 1.6578947368421053e-05, + "loss": 40.7098, + "step": 2475 + }, + { + "epoch": 8.939051918735892, + "grad_norm": 334.1268615722656, + "learning_rate": 1.6573502722323052e-05, + "loss": 40.5404, + "step": 2476 + }, + { + "epoch": 8.942663656884875, + "grad_norm": 319.56689453125, + "learning_rate": 1.6568058076225047e-05, + "loss": 40.3434, + "step": 2477 + }, + { + "epoch": 8.94627539503386, + "grad_norm": 388.0625915527344, + "learning_rate": 1.6562613430127043e-05, + "loss": 41.1956, + "step": 2478 + }, + { + "epoch": 8.949887133182845, + "grad_norm": 256.9087829589844, + "learning_rate": 1.6557168784029038e-05, + "loss": 41.9647, + "step": 2479 + }, + { + "epoch": 8.953498871331828, + "grad_norm": 248.2635040283203, + "learning_rate": 1.6551724137931033e-05, + "loss": 41.1885, + "step": 2480 + }, + { + "epoch": 8.953498871331828, + "eval_loss": 0.6198933124542236, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2480 + }, + { + "epoch": 8.957110609480813, + "grad_norm": 236.89004516601562, + "learning_rate": 1.6546279491833032e-05, + "loss": 41.2178, + "step": 2481 + }, + { + "epoch": 8.960722347629797, + "grad_norm": 260.47357177734375, + "learning_rate": 1.6540834845735027e-05, + "loss": 42.1472, + "step": 2482 + }, + { + "epoch": 8.96433408577878, + "grad_norm": 216.1390380859375, + "learning_rate": 1.6535390199637023e-05, + "loss": 36.14, + "step": 2483 + }, + { + "epoch": 8.967945823927765, + "grad_norm": 194.7316131591797, + "learning_rate": 1.652994555353902e-05, + "loss": 33.7272, + "step": 2484 + }, + { + "epoch": 8.97155756207675, + "grad_norm": 202.0404052734375, + "learning_rate": 1.6524500907441017e-05, + "loss": 34.9427, + "step": 2485 + }, + { + "epoch": 8.975169300225733, + "grad_norm": 196.98463439941406, + "learning_rate": 1.6519056261343016e-05, + "loss": 36.4874, + "step": 2486 + }, + { + "epoch": 8.978781038374718, + "grad_norm": 211.46177673339844, + "learning_rate": 1.651361161524501e-05, + "loss": 35.7667, + "step": 2487 + }, + { + "epoch": 8.982392776523701, + "grad_norm": 190.47093200683594, + "learning_rate": 1.6508166969147006e-05, + "loss": 35.6874, + "step": 2488 + }, + { + "epoch": 8.986004514672686, + "grad_norm": 194.9825897216797, + "learning_rate": 1.6502722323049002e-05, + "loss": 36.8718, + "step": 2489 + }, + { + "epoch": 8.989616252821671, + "grad_norm": 230.24774169921875, + "learning_rate": 1.6497277676950997e-05, + "loss": 37.4962, + "step": 2490 + }, + { + "epoch": 8.989616252821671, + "eval_loss": 0.6168100237846375, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 2490 + }, + { + "epoch": 8.993227990970654, + "grad_norm": 266.5688171386719, + "learning_rate": 1.6491833030852993e-05, + "loss": 35.5063, + "step": 2491 + }, + { + "epoch": 8.996839729119639, + "grad_norm": 230.923828125, + "learning_rate": 1.648638838475499e-05, + "loss": 23.5847, + "step": 2492 + }, + { + "epoch": 9.0, + "grad_norm": 187.365478515625, + "learning_rate": 1.6480943738656987e-05, + "loss": 21.7926, + "step": 2493 + }, + { + "epoch": 9.003611738148985, + "grad_norm": 283.487060546875, + "learning_rate": 1.6475499092558986e-05, + "loss": 41.4221, + "step": 2494 + }, + { + "epoch": 9.007223476297968, + "grad_norm": 234.38009643554688, + "learning_rate": 1.647005444646098e-05, + "loss": 43.3343, + "step": 2495 + }, + { + "epoch": 9.010835214446953, + "grad_norm": 253.75588989257812, + "learning_rate": 1.6464609800362976e-05, + "loss": 42.1983, + "step": 2496 + }, + { + "epoch": 9.014446952595938, + "grad_norm": 224.6202392578125, + "learning_rate": 1.6459165154264975e-05, + "loss": 41.5355, + "step": 2497 + }, + { + "epoch": 9.01805869074492, + "grad_norm": 261.0040588378906, + "learning_rate": 1.645372050816697e-05, + "loss": 42.3058, + "step": 2498 + }, + { + "epoch": 9.021670428893906, + "grad_norm": 191.44142150878906, + "learning_rate": 1.6448275862068966e-05, + "loss": 42.3911, + "step": 2499 + }, + { + "epoch": 9.025282167042889, + "grad_norm": 246.79278564453125, + "learning_rate": 1.644283121597096e-05, + "loss": 41.6238, + "step": 2500 + }, + { + "epoch": 9.025282167042889, + "eval_loss": 0.6220878958702087, + "eval_runtime": 3.1552, + "eval_samples_per_second": 56.731, + "eval_steps_per_second": 56.731, + "step": 2500 + }, + { + "epoch": 9.028893905191874, + "grad_norm": 251.5475311279297, + "learning_rate": 1.6437386569872957e-05, + "loss": 43.9275, + "step": 2501 + }, + { + "epoch": 9.032505643340858, + "grad_norm": 300.0381164550781, + "learning_rate": 1.6431941923774952e-05, + "loss": 42.8938, + "step": 2502 + }, + { + "epoch": 9.036117381489841, + "grad_norm": 310.0517883300781, + "learning_rate": 1.6426497277676954e-05, + "loss": 42.3538, + "step": 2503 + }, + { + "epoch": 9.039729119638826, + "grad_norm": 213.50392150878906, + "learning_rate": 1.642105263157895e-05, + "loss": 40.2305, + "step": 2504 + }, + { + "epoch": 9.043340857787811, + "grad_norm": 173.3816680908203, + "learning_rate": 1.6415607985480945e-05, + "loss": 38.3336, + "step": 2505 + }, + { + "epoch": 9.046952595936794, + "grad_norm": 195.51968383789062, + "learning_rate": 1.641016333938294e-05, + "loss": 38.5937, + "step": 2506 + }, + { + "epoch": 9.050564334085779, + "grad_norm": 195.68910217285156, + "learning_rate": 1.6404718693284936e-05, + "loss": 37.9994, + "step": 2507 + }, + { + "epoch": 9.054176072234762, + "grad_norm": 239.56704711914062, + "learning_rate": 1.6399274047186934e-05, + "loss": 38.6006, + "step": 2508 + }, + { + "epoch": 9.057787810383747, + "grad_norm": 455.8309326171875, + "learning_rate": 1.639382940108893e-05, + "loss": 39.9516, + "step": 2509 + }, + { + "epoch": 9.061399548532732, + "grad_norm": 188.0857696533203, + "learning_rate": 1.6388384754990925e-05, + "loss": 38.8922, + "step": 2510 + }, + { + "epoch": 9.061399548532732, + "eval_loss": 0.6177002191543579, + "eval_runtime": 3.1595, + "eval_samples_per_second": 56.654, + "eval_steps_per_second": 56.654, + "step": 2510 + }, + { + "epoch": 9.065011286681715, + "grad_norm": 211.76168823242188, + "learning_rate": 1.638294010889292e-05, + "loss": 38.8895, + "step": 2511 + }, + { + "epoch": 9.0686230248307, + "grad_norm": 281.7332458496094, + "learning_rate": 1.637749546279492e-05, + "loss": 39.9238, + "step": 2512 + }, + { + "epoch": 9.072234762979685, + "grad_norm": 254.9953155517578, + "learning_rate": 1.6372050816696915e-05, + "loss": 41.2667, + "step": 2513 + }, + { + "epoch": 9.075846501128668, + "grad_norm": 233.8746337890625, + "learning_rate": 1.6366606170598914e-05, + "loss": 39.3087, + "step": 2514 + }, + { + "epoch": 9.079458239277653, + "grad_norm": 317.71270751953125, + "learning_rate": 1.636116152450091e-05, + "loss": 40.4902, + "step": 2515 + }, + { + "epoch": 9.083069977426636, + "grad_norm": 227.5228271484375, + "learning_rate": 1.6355716878402904e-05, + "loss": 40.1197, + "step": 2516 + }, + { + "epoch": 9.08668171557562, + "grad_norm": 225.84423828125, + "learning_rate": 1.63502722323049e-05, + "loss": 42.9099, + "step": 2517 + }, + { + "epoch": 9.090293453724605, + "grad_norm": 255.20858764648438, + "learning_rate": 1.6344827586206895e-05, + "loss": 42.0515, + "step": 2518 + }, + { + "epoch": 9.093905191873588, + "grad_norm": 215.45352172851562, + "learning_rate": 1.6339382940108894e-05, + "loss": 41.6817, + "step": 2519 + }, + { + "epoch": 9.097516930022573, + "grad_norm": 233.5334014892578, + "learning_rate": 1.633393829401089e-05, + "loss": 42.6121, + "step": 2520 + }, + { + "epoch": 9.097516930022573, + "eval_loss": 0.6148340106010437, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 2520 + }, + { + "epoch": 9.101128668171558, + "grad_norm": 196.54132080078125, + "learning_rate": 1.6328493647912888e-05, + "loss": 40.5833, + "step": 2521 + }, + { + "epoch": 9.104740406320541, + "grad_norm": 296.7503967285156, + "learning_rate": 1.6323049001814883e-05, + "loss": 39.098, + "step": 2522 + }, + { + "epoch": 9.108352144469526, + "grad_norm": 272.1104431152344, + "learning_rate": 1.631760435571688e-05, + "loss": 36.0076, + "step": 2523 + }, + { + "epoch": 9.111963882618511, + "grad_norm": 197.3100128173828, + "learning_rate": 1.6312159709618874e-05, + "loss": 33.3503, + "step": 2524 + }, + { + "epoch": 9.115575620767494, + "grad_norm": 223.1310272216797, + "learning_rate": 1.6306715063520873e-05, + "loss": 33.1386, + "step": 2525 + }, + { + "epoch": 9.119187358916479, + "grad_norm": 234.86093139648438, + "learning_rate": 1.630127041742287e-05, + "loss": 34.2101, + "step": 2526 + }, + { + "epoch": 9.122799097065462, + "grad_norm": 244.72328186035156, + "learning_rate": 1.6295825771324864e-05, + "loss": 34.955, + "step": 2527 + }, + { + "epoch": 9.126410835214447, + "grad_norm": 198.89134216308594, + "learning_rate": 1.629038112522686e-05, + "loss": 34.5405, + "step": 2528 + }, + { + "epoch": 9.130022573363432, + "grad_norm": 236.64096069335938, + "learning_rate": 1.6284936479128854e-05, + "loss": 35.2328, + "step": 2529 + }, + { + "epoch": 9.133634311512415, + "grad_norm": 212.8743438720703, + "learning_rate": 1.6279491833030853e-05, + "loss": 34.6642, + "step": 2530 + }, + { + "epoch": 9.133634311512415, + "eval_loss": 0.6154256463050842, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 2530 + }, + { + "epoch": 9.1372460496614, + "grad_norm": 227.15135192871094, + "learning_rate": 1.6274047186932852e-05, + "loss": 35.652, + "step": 2531 + }, + { + "epoch": 9.140857787810384, + "grad_norm": 207.30572509765625, + "learning_rate": 1.6268602540834847e-05, + "loss": 36.8476, + "step": 2532 + }, + { + "epoch": 9.144469525959368, + "grad_norm": 222.18023681640625, + "learning_rate": 1.6263157894736843e-05, + "loss": 35.8299, + "step": 2533 + }, + { + "epoch": 9.148081264108352, + "grad_norm": 283.674072265625, + "learning_rate": 1.6257713248638838e-05, + "loss": 36.5074, + "step": 2534 + }, + { + "epoch": 9.151693002257336, + "grad_norm": 235.69752502441406, + "learning_rate": 1.6252268602540834e-05, + "loss": 37.344, + "step": 2535 + }, + { + "epoch": 9.15530474040632, + "grad_norm": 224.37965393066406, + "learning_rate": 1.6246823956442832e-05, + "loss": 37.8138, + "step": 2536 + }, + { + "epoch": 9.158916478555305, + "grad_norm": 217.52230834960938, + "learning_rate": 1.6241379310344828e-05, + "loss": 37.1529, + "step": 2537 + }, + { + "epoch": 9.162528216704288, + "grad_norm": 234.7586212158203, + "learning_rate": 1.6235934664246823e-05, + "loss": 36.3247, + "step": 2538 + }, + { + "epoch": 9.166139954853273, + "grad_norm": 239.52479553222656, + "learning_rate": 1.623049001814882e-05, + "loss": 30.0805, + "step": 2539 + }, + { + "epoch": 9.169751693002258, + "grad_norm": 223.7616424560547, + "learning_rate": 1.6225045372050817e-05, + "loss": 23.8492, + "step": 2540 + }, + { + "epoch": 9.169751693002258, + "eval_loss": 0.6244915723800659, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.031, + "eval_steps_per_second": 57.031, + "step": 2540 + }, + { + "epoch": 9.173363431151241, + "grad_norm": 213.41371154785156, + "learning_rate": 1.6219600725952816e-05, + "loss": 23.3557, + "step": 2541 + }, + { + "epoch": 9.176975169300226, + "grad_norm": 162.4627685546875, + "learning_rate": 1.621415607985481e-05, + "loss": 23.8834, + "step": 2542 + }, + { + "epoch": 9.18058690744921, + "grad_norm": 172.13250732421875, + "learning_rate": 1.6208711433756807e-05, + "loss": 24.6428, + "step": 2543 + }, + { + "epoch": 9.184198645598194, + "grad_norm": 229.30799865722656, + "learning_rate": 1.6203266787658802e-05, + "loss": 42.5908, + "step": 2544 + }, + { + "epoch": 9.187810383747179, + "grad_norm": 195.30130004882812, + "learning_rate": 1.6197822141560798e-05, + "loss": 43.7286, + "step": 2545 + }, + { + "epoch": 9.191422121896162, + "grad_norm": 227.4984893798828, + "learning_rate": 1.6192377495462793e-05, + "loss": 43.5012, + "step": 2546 + }, + { + "epoch": 9.195033860045147, + "grad_norm": 254.69615173339844, + "learning_rate": 1.6186932849364792e-05, + "loss": 41.9295, + "step": 2547 + }, + { + "epoch": 9.198645598194132, + "grad_norm": 251.33778381347656, + "learning_rate": 1.6181488203266787e-05, + "loss": 42.0838, + "step": 2548 + }, + { + "epoch": 9.202257336343115, + "grad_norm": 237.91677856445312, + "learning_rate": 1.6176043557168786e-05, + "loss": 43.0031, + "step": 2549 + }, + { + "epoch": 9.2058690744921, + "grad_norm": 258.0311584472656, + "learning_rate": 1.617059891107078e-05, + "loss": 42.7196, + "step": 2550 + }, + { + "epoch": 9.2058690744921, + "eval_loss": 0.6245208978652954, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2550 + }, + { + "epoch": 9.209480812641084, + "grad_norm": 197.14703369140625, + "learning_rate": 1.6165154264972777e-05, + "loss": 42.1342, + "step": 2551 + }, + { + "epoch": 9.213092550790067, + "grad_norm": 235.19705200195312, + "learning_rate": 1.6159709618874775e-05, + "loss": 41.8462, + "step": 2552 + }, + { + "epoch": 9.216704288939052, + "grad_norm": 198.409423828125, + "learning_rate": 1.615426497277677e-05, + "loss": 43.5993, + "step": 2553 + }, + { + "epoch": 9.220316027088035, + "grad_norm": 254.08590698242188, + "learning_rate": 1.6148820326678766e-05, + "loss": 40.771, + "step": 2554 + }, + { + "epoch": 9.22392776523702, + "grad_norm": 181.64808654785156, + "learning_rate": 1.614337568058076e-05, + "loss": 39.3511, + "step": 2555 + }, + { + "epoch": 9.227539503386005, + "grad_norm": 294.1127014160156, + "learning_rate": 1.6137931034482757e-05, + "loss": 39.6586, + "step": 2556 + }, + { + "epoch": 9.231151241534988, + "grad_norm": 197.59982299804688, + "learning_rate": 1.6132486388384752e-05, + "loss": 38.2575, + "step": 2557 + }, + { + "epoch": 9.234762979683973, + "grad_norm": 223.74717712402344, + "learning_rate": 1.6127041742286754e-05, + "loss": 38.8801, + "step": 2558 + }, + { + "epoch": 9.238374717832958, + "grad_norm": 279.2779541015625, + "learning_rate": 1.612159709618875e-05, + "loss": 40.4591, + "step": 2559 + }, + { + "epoch": 9.241986455981941, + "grad_norm": 258.75909423828125, + "learning_rate": 1.6116152450090745e-05, + "loss": 39.2172, + "step": 2560 + }, + { + "epoch": 9.241986455981941, + "eval_loss": 0.6209923624992371, + "eval_runtime": 3.1523, + "eval_samples_per_second": 56.784, + "eval_steps_per_second": 56.784, + "step": 2560 + }, + { + "epoch": 9.245598194130926, + "grad_norm": 305.0645446777344, + "learning_rate": 1.611070780399274e-05, + "loss": 40.442, + "step": 2561 + }, + { + "epoch": 9.249209932279909, + "grad_norm": 196.18557739257812, + "learning_rate": 1.6105263157894736e-05, + "loss": 39.7092, + "step": 2562 + }, + { + "epoch": 9.252821670428894, + "grad_norm": 214.3220977783203, + "learning_rate": 1.6099818511796735e-05, + "loss": 39.3935, + "step": 2563 + }, + { + "epoch": 9.256433408577879, + "grad_norm": 217.2801055908203, + "learning_rate": 1.609437386569873e-05, + "loss": 40.39, + "step": 2564 + }, + { + "epoch": 9.260045146726862, + "grad_norm": 205.17446899414062, + "learning_rate": 1.6088929219600726e-05, + "loss": 39.9531, + "step": 2565 + }, + { + "epoch": 9.263656884875846, + "grad_norm": 197.3854217529297, + "learning_rate": 1.608348457350272e-05, + "loss": 40.474, + "step": 2566 + }, + { + "epoch": 9.267268623024831, + "grad_norm": 264.3934631347656, + "learning_rate": 1.607803992740472e-05, + "loss": 41.2794, + "step": 2567 + }, + { + "epoch": 9.270880361173814, + "grad_norm": 226.6471710205078, + "learning_rate": 1.6072595281306715e-05, + "loss": 40.3425, + "step": 2568 + }, + { + "epoch": 9.2744920993228, + "grad_norm": 198.62734985351562, + "learning_rate": 1.6067150635208714e-05, + "loss": 41.6261, + "step": 2569 + }, + { + "epoch": 9.278103837471784, + "grad_norm": 207.73509216308594, + "learning_rate": 1.606170598911071e-05, + "loss": 41.7835, + "step": 2570 + }, + { + "epoch": 9.278103837471784, + "eval_loss": 0.6173180937767029, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 2570 + }, + { + "epoch": 9.281715575620767, + "grad_norm": 214.13601684570312, + "learning_rate": 1.6056261343012705e-05, + "loss": 40.0095, + "step": 2571 + }, + { + "epoch": 9.285327313769752, + "grad_norm": 218.0533905029297, + "learning_rate": 1.60508166969147e-05, + "loss": 40.014, + "step": 2572 + }, + { + "epoch": 9.288939051918735, + "grad_norm": 211.27984619140625, + "learning_rate": 1.6045372050816695e-05, + "loss": 36.7399, + "step": 2573 + }, + { + "epoch": 9.29255079006772, + "grad_norm": 201.9020233154297, + "learning_rate": 1.6039927404718694e-05, + "loss": 33.7555, + "step": 2574 + }, + { + "epoch": 9.296162528216705, + "grad_norm": 230.27149963378906, + "learning_rate": 1.603448275862069e-05, + "loss": 32.9646, + "step": 2575 + }, + { + "epoch": 9.299774266365688, + "grad_norm": 208.77622985839844, + "learning_rate": 1.6029038112522685e-05, + "loss": 33.5332, + "step": 2576 + }, + { + "epoch": 9.303386004514673, + "grad_norm": 225.02796936035156, + "learning_rate": 1.6023593466424684e-05, + "loss": 34.2592, + "step": 2577 + }, + { + "epoch": 9.306997742663658, + "grad_norm": 201.79612731933594, + "learning_rate": 1.601814882032668e-05, + "loss": 34.6686, + "step": 2578 + }, + { + "epoch": 9.31060948081264, + "grad_norm": 235.6588134765625, + "learning_rate": 1.6012704174228678e-05, + "loss": 35.4554, + "step": 2579 + }, + { + "epoch": 9.314221218961626, + "grad_norm": 273.51904296875, + "learning_rate": 1.6007259528130673e-05, + "loss": 35.2077, + "step": 2580 + }, + { + "epoch": 9.314221218961626, + "eval_loss": 0.6169624328613281, + "eval_runtime": 3.1501, + "eval_samples_per_second": 56.823, + "eval_steps_per_second": 56.823, + "step": 2580 + }, + { + "epoch": 9.317832957110609, + "grad_norm": 199.19541931152344, + "learning_rate": 1.600181488203267e-05, + "loss": 35.0703, + "step": 2581 + }, + { + "epoch": 9.321444695259594, + "grad_norm": 212.49276733398438, + "learning_rate": 1.5996370235934664e-05, + "loss": 35.9691, + "step": 2582 + }, + { + "epoch": 9.325056433408578, + "grad_norm": 193.7330322265625, + "learning_rate": 1.599092558983666e-05, + "loss": 34.9043, + "step": 2583 + }, + { + "epoch": 9.328668171557561, + "grad_norm": 196.00503540039062, + "learning_rate": 1.5985480943738655e-05, + "loss": 36.3508, + "step": 2584 + }, + { + "epoch": 9.332279909706546, + "grad_norm": 218.78392028808594, + "learning_rate": 1.5980036297640654e-05, + "loss": 34.7672, + "step": 2585 + }, + { + "epoch": 9.335891647855531, + "grad_norm": 235.76873779296875, + "learning_rate": 1.5974591651542652e-05, + "loss": 36.8695, + "step": 2586 + }, + { + "epoch": 9.339503386004514, + "grad_norm": 250.538330078125, + "learning_rate": 1.5969147005444648e-05, + "loss": 37.4531, + "step": 2587 + }, + { + "epoch": 9.343115124153499, + "grad_norm": 234.12469482421875, + "learning_rate": 1.5963702359346643e-05, + "loss": 37.4506, + "step": 2588 + }, + { + "epoch": 9.346726862302482, + "grad_norm": 209.3461151123047, + "learning_rate": 1.595825771324864e-05, + "loss": 31.3062, + "step": 2589 + }, + { + "epoch": 9.350338600451467, + "grad_norm": 211.12277221679688, + "learning_rate": 1.5952813067150637e-05, + "loss": 23.3303, + "step": 2590 + }, + { + "epoch": 9.350338600451467, + "eval_loss": 0.6222187876701355, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 2590 + }, + { + "epoch": 9.353950338600452, + "grad_norm": 200.1257781982422, + "learning_rate": 1.5947368421052633e-05, + "loss": 22.9145, + "step": 2591 + }, + { + "epoch": 9.357562076749435, + "grad_norm": 179.01475524902344, + "learning_rate": 1.5941923774954628e-05, + "loss": 23.8842, + "step": 2592 + }, + { + "epoch": 9.36117381489842, + "grad_norm": 214.9254608154297, + "learning_rate": 1.5936479128856623e-05, + "loss": 25.4154, + "step": 2593 + }, + { + "epoch": 9.364785553047405, + "grad_norm": 211.63735961914062, + "learning_rate": 1.593103448275862e-05, + "loss": 42.6467, + "step": 2594 + }, + { + "epoch": 9.368397291196388, + "grad_norm": 232.43194580078125, + "learning_rate": 1.5925589836660618e-05, + "loss": 43.3501, + "step": 2595 + }, + { + "epoch": 9.372009029345373, + "grad_norm": 220.61468505859375, + "learning_rate": 1.5920145190562616e-05, + "loss": 43.4324, + "step": 2596 + }, + { + "epoch": 9.375620767494357, + "grad_norm": 179.00894165039062, + "learning_rate": 1.591470054446461e-05, + "loss": 41.9646, + "step": 2597 + }, + { + "epoch": 9.37923250564334, + "grad_norm": 203.847412109375, + "learning_rate": 1.5909255898366607e-05, + "loss": 41.1242, + "step": 2598 + }, + { + "epoch": 9.382844243792325, + "grad_norm": 244.20164489746094, + "learning_rate": 1.5903811252268602e-05, + "loss": 42.2451, + "step": 2599 + }, + { + "epoch": 9.386455981941308, + "grad_norm": 203.60154724121094, + "learning_rate": 1.5898366606170598e-05, + "loss": 42.0361, + "step": 2600 + }, + { + "epoch": 9.386455981941308, + "eval_loss": 0.627146303653717, + "eval_runtime": 3.1484, + "eval_samples_per_second": 56.854, + "eval_steps_per_second": 56.854, + "step": 2600 + }, + { + "epoch": 9.390067720090293, + "grad_norm": 185.1741180419922, + "learning_rate": 1.5892921960072597e-05, + "loss": 41.9657, + "step": 2601 + }, + { + "epoch": 9.393679458239278, + "grad_norm": 211.64219665527344, + "learning_rate": 1.5887477313974592e-05, + "loss": 42.2619, + "step": 2602 + }, + { + "epoch": 9.397291196388261, + "grad_norm": 253.31997680664062, + "learning_rate": 1.5882032667876587e-05, + "loss": 42.5666, + "step": 2603 + }, + { + "epoch": 9.400902934537246, + "grad_norm": 257.8781433105469, + "learning_rate": 1.5876588021778586e-05, + "loss": 43.1747, + "step": 2604 + }, + { + "epoch": 9.404514672686231, + "grad_norm": 171.05398559570312, + "learning_rate": 1.587114337568058e-05, + "loss": 41.2645, + "step": 2605 + }, + { + "epoch": 9.408126410835214, + "grad_norm": 209.83749389648438, + "learning_rate": 1.5865698729582577e-05, + "loss": 38.7138, + "step": 2606 + }, + { + "epoch": 9.411738148984199, + "grad_norm": 303.92059326171875, + "learning_rate": 1.5860254083484576e-05, + "loss": 38.7962, + "step": 2607 + }, + { + "epoch": 9.415349887133182, + "grad_norm": 271.9322204589844, + "learning_rate": 1.585480943738657e-05, + "loss": 39.0622, + "step": 2608 + }, + { + "epoch": 9.418961625282167, + "grad_norm": 222.8749542236328, + "learning_rate": 1.5849364791288566e-05, + "loss": 40.0773, + "step": 2609 + }, + { + "epoch": 9.422573363431152, + "grad_norm": 194.549072265625, + "learning_rate": 1.5843920145190562e-05, + "loss": 39.3495, + "step": 2610 + }, + { + "epoch": 9.422573363431152, + "eval_loss": 0.618250846862793, + "eval_runtime": 3.1517, + "eval_samples_per_second": 56.796, + "eval_steps_per_second": 56.796, + "step": 2610 + }, + { + "epoch": 9.426185101580135, + "grad_norm": 231.32623291015625, + "learning_rate": 1.5838475499092557e-05, + "loss": 39.7577, + "step": 2611 + }, + { + "epoch": 9.42979683972912, + "grad_norm": 185.9986114501953, + "learning_rate": 1.5833030852994556e-05, + "loss": 40.9342, + "step": 2612 + }, + { + "epoch": 9.433408577878104, + "grad_norm": 221.356201171875, + "learning_rate": 1.5827586206896555e-05, + "loss": 39.7733, + "step": 2613 + }, + { + "epoch": 9.437020316027088, + "grad_norm": 216.2249755859375, + "learning_rate": 1.582214156079855e-05, + "loss": 39.7559, + "step": 2614 + }, + { + "epoch": 9.440632054176072, + "grad_norm": 263.5106201171875, + "learning_rate": 1.5816696914700546e-05, + "loss": 41.2872, + "step": 2615 + }, + { + "epoch": 9.444243792325057, + "grad_norm": 281.9518127441406, + "learning_rate": 1.581125226860254e-05, + "loss": 41.1114, + "step": 2616 + }, + { + "epoch": 9.44785553047404, + "grad_norm": 200.2808074951172, + "learning_rate": 1.5805807622504536e-05, + "loss": 41.7711, + "step": 2617 + }, + { + "epoch": 9.451467268623025, + "grad_norm": 233.034912109375, + "learning_rate": 1.5800362976406535e-05, + "loss": 41.3306, + "step": 2618 + }, + { + "epoch": 9.455079006772008, + "grad_norm": 215.5499725341797, + "learning_rate": 1.579491833030853e-05, + "loss": 41.0065, + "step": 2619 + }, + { + "epoch": 9.458690744920993, + "grad_norm": 220.21153259277344, + "learning_rate": 1.5789473684210526e-05, + "loss": 42.1116, + "step": 2620 + }, + { + "epoch": 9.458690744920993, + "eval_loss": 0.6146022081375122, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.915, + "eval_steps_per_second": 56.915, + "step": 2620 + }, + { + "epoch": 9.462302483069978, + "grad_norm": 198.20001220703125, + "learning_rate": 1.578402903811252e-05, + "loss": 39.637, + "step": 2621 + }, + { + "epoch": 9.465914221218961, + "grad_norm": 228.18357849121094, + "learning_rate": 1.5778584392014517e-05, + "loss": 37.3831, + "step": 2622 + }, + { + "epoch": 9.469525959367946, + "grad_norm": 207.68040466308594, + "learning_rate": 1.577313974591652e-05, + "loss": 35.6356, + "step": 2623 + }, + { + "epoch": 9.47313769751693, + "grad_norm": 267.0474853515625, + "learning_rate": 1.5767695099818514e-05, + "loss": 34.5549, + "step": 2624 + }, + { + "epoch": 9.476749435665914, + "grad_norm": 191.4129638671875, + "learning_rate": 1.576225045372051e-05, + "loss": 35.1065, + "step": 2625 + }, + { + "epoch": 9.480361173814899, + "grad_norm": 220.85708618164062, + "learning_rate": 1.5756805807622505e-05, + "loss": 34.9115, + "step": 2626 + }, + { + "epoch": 9.483972911963882, + "grad_norm": 218.62460327148438, + "learning_rate": 1.57513611615245e-05, + "loss": 33.9542, + "step": 2627 + }, + { + "epoch": 9.487584650112867, + "grad_norm": 184.085693359375, + "learning_rate": 1.5745916515426496e-05, + "loss": 35.2981, + "step": 2628 + }, + { + "epoch": 9.491196388261852, + "grad_norm": 286.73236083984375, + "learning_rate": 1.5740471869328494e-05, + "loss": 36.8326, + "step": 2629 + }, + { + "epoch": 9.494808126410835, + "grad_norm": 326.4263000488281, + "learning_rate": 1.573502722323049e-05, + "loss": 35.9728, + "step": 2630 + }, + { + "epoch": 9.494808126410835, + "eval_loss": 0.6165672540664673, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 2630 + }, + { + "epoch": 9.49841986455982, + "grad_norm": 283.330322265625, + "learning_rate": 1.5729582577132485e-05, + "loss": 37.4227, + "step": 2631 + }, + { + "epoch": 9.502031602708804, + "grad_norm": 208.65829467773438, + "learning_rate": 1.5724137931034484e-05, + "loss": 36.8613, + "step": 2632 + }, + { + "epoch": 9.505643340857787, + "grad_norm": 191.59429931640625, + "learning_rate": 1.571869328493648e-05, + "loss": 36.2332, + "step": 2633 + }, + { + "epoch": 9.509255079006772, + "grad_norm": 306.4736022949219, + "learning_rate": 1.5713248638838478e-05, + "loss": 36.8045, + "step": 2634 + }, + { + "epoch": 9.512866817155757, + "grad_norm": 226.97509765625, + "learning_rate": 1.5707803992740474e-05, + "loss": 37.005, + "step": 2635 + }, + { + "epoch": 9.51647855530474, + "grad_norm": 230.47683715820312, + "learning_rate": 1.570235934664247e-05, + "loss": 36.9168, + "step": 2636 + }, + { + "epoch": 9.520090293453725, + "grad_norm": 221.44483947753906, + "learning_rate": 1.5696914700544464e-05, + "loss": 39.0025, + "step": 2637 + }, + { + "epoch": 9.523702031602708, + "grad_norm": 249.1531219482422, + "learning_rate": 1.569147005444646e-05, + "loss": 38.1069, + "step": 2638 + }, + { + "epoch": 9.527313769751693, + "grad_norm": 276.8532409667969, + "learning_rate": 1.5686025408348455e-05, + "loss": 30.9819, + "step": 2639 + }, + { + "epoch": 9.530925507900678, + "grad_norm": 218.25035095214844, + "learning_rate": 1.5680580762250454e-05, + "loss": 23.4807, + "step": 2640 + }, + { + "epoch": 9.530925507900678, + "eval_loss": 0.619295060634613, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.958, + "eval_steps_per_second": 56.958, + "step": 2640 + }, + { + "epoch": 9.534537246049661, + "grad_norm": 185.83737182617188, + "learning_rate": 1.5675136116152453e-05, + "loss": 22.5394, + "step": 2641 + }, + { + "epoch": 9.538148984198646, + "grad_norm": 181.9920654296875, + "learning_rate": 1.5669691470054448e-05, + "loss": 23.9106, + "step": 2642 + }, + { + "epoch": 9.54176072234763, + "grad_norm": 209.20391845703125, + "learning_rate": 1.5664246823956443e-05, + "loss": 25.5328, + "step": 2643 + }, + { + "epoch": 9.545372460496614, + "grad_norm": 223.86093139648438, + "learning_rate": 1.565880217785844e-05, + "loss": 42.8563, + "step": 2644 + }, + { + "epoch": 9.548984198645599, + "grad_norm": 232.3086395263672, + "learning_rate": 1.5653357531760438e-05, + "loss": 44.0178, + "step": 2645 + }, + { + "epoch": 9.552595936794582, + "grad_norm": 223.76541137695312, + "learning_rate": 1.5647912885662433e-05, + "loss": 43.4928, + "step": 2646 + }, + { + "epoch": 9.556207674943566, + "grad_norm": 258.86700439453125, + "learning_rate": 1.5642468239564428e-05, + "loss": 42.3422, + "step": 2647 + }, + { + "epoch": 9.559819413092551, + "grad_norm": 255.09033203125, + "learning_rate": 1.5637023593466424e-05, + "loss": 41.6588, + "step": 2648 + }, + { + "epoch": 9.563431151241534, + "grad_norm": 205.88563537597656, + "learning_rate": 1.563157894736842e-05, + "loss": 41.9267, + "step": 2649 + }, + { + "epoch": 9.56704288939052, + "grad_norm": 204.12318420410156, + "learning_rate": 1.5626134301270418e-05, + "loss": 43.0326, + "step": 2650 + }, + { + "epoch": 9.56704288939052, + "eval_loss": 0.6218730807304382, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2650 + }, + { + "epoch": 9.570654627539504, + "grad_norm": 259.5694274902344, + "learning_rate": 1.5620689655172417e-05, + "loss": 42.9604, + "step": 2651 + }, + { + "epoch": 9.574266365688487, + "grad_norm": 234.35935974121094, + "learning_rate": 1.5615245009074412e-05, + "loss": 42.7316, + "step": 2652 + }, + { + "epoch": 9.577878103837472, + "grad_norm": 237.14346313476562, + "learning_rate": 1.5609800362976407e-05, + "loss": 42.4559, + "step": 2653 + }, + { + "epoch": 9.581489841986457, + "grad_norm": 208.2974395751953, + "learning_rate": 1.5604355716878403e-05, + "loss": 40.1113, + "step": 2654 + }, + { + "epoch": 9.58510158013544, + "grad_norm": 212.18814086914062, + "learning_rate": 1.5598911070780398e-05, + "loss": 38.6515, + "step": 2655 + }, + { + "epoch": 9.588713318284425, + "grad_norm": 245.23240661621094, + "learning_rate": 1.5593466424682397e-05, + "loss": 39.5289, + "step": 2656 + }, + { + "epoch": 9.592325056433408, + "grad_norm": 261.1321105957031, + "learning_rate": 1.5588021778584392e-05, + "loss": 39.3232, + "step": 2657 + }, + { + "epoch": 9.595936794582393, + "grad_norm": 257.67962646484375, + "learning_rate": 1.5582577132486388e-05, + "loss": 40.3963, + "step": 2658 + }, + { + "epoch": 9.599548532731378, + "grad_norm": 299.93914794921875, + "learning_rate": 1.5577132486388383e-05, + "loss": 39.0657, + "step": 2659 + }, + { + "epoch": 9.60316027088036, + "grad_norm": 215.45407104492188, + "learning_rate": 1.5571687840290382e-05, + "loss": 40.1408, + "step": 2660 + }, + { + "epoch": 9.60316027088036, + "eval_loss": 0.6216554045677185, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 2660 + }, + { + "epoch": 9.606772009029346, + "grad_norm": 273.9233093261719, + "learning_rate": 1.5566243194192377e-05, + "loss": 40.6894, + "step": 2661 + }, + { + "epoch": 9.610383747178329, + "grad_norm": 220.76344299316406, + "learning_rate": 1.5560798548094376e-05, + "loss": 40.8146, + "step": 2662 + }, + { + "epoch": 9.613995485327314, + "grad_norm": 200.33929443359375, + "learning_rate": 1.555535390199637e-05, + "loss": 40.1362, + "step": 2663 + }, + { + "epoch": 9.617607223476298, + "grad_norm": 223.38536071777344, + "learning_rate": 1.5549909255898367e-05, + "loss": 39.3488, + "step": 2664 + }, + { + "epoch": 9.621218961625281, + "grad_norm": 240.99578857421875, + "learning_rate": 1.5544464609800362e-05, + "loss": 41.771, + "step": 2665 + }, + { + "epoch": 9.624830699774266, + "grad_norm": 202.30323791503906, + "learning_rate": 1.5539019963702357e-05, + "loss": 41.1412, + "step": 2666 + }, + { + "epoch": 9.628442437923251, + "grad_norm": 193.8411865234375, + "learning_rate": 1.5533575317604356e-05, + "loss": 41.0064, + "step": 2667 + }, + { + "epoch": 9.632054176072234, + "grad_norm": 197.1542510986328, + "learning_rate": 1.552813067150635e-05, + "loss": 41.4787, + "step": 2668 + }, + { + "epoch": 9.635665914221219, + "grad_norm": 259.21954345703125, + "learning_rate": 1.552268602540835e-05, + "loss": 41.753, + "step": 2669 + }, + { + "epoch": 9.639277652370204, + "grad_norm": 290.9770202636719, + "learning_rate": 1.5517241379310346e-05, + "loss": 40.4589, + "step": 2670 + }, + { + "epoch": 9.639277652370204, + "eval_loss": 0.6132164001464844, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.963, + "eval_steps_per_second": 56.963, + "step": 2670 + }, + { + "epoch": 9.642889390519187, + "grad_norm": 252.86219787597656, + "learning_rate": 1.551179673321234e-05, + "loss": 37.356, + "step": 2671 + }, + { + "epoch": 9.646501128668172, + "grad_norm": 207.79254150390625, + "learning_rate": 1.550635208711434e-05, + "loss": 36.2071, + "step": 2672 + }, + { + "epoch": 9.650112866817155, + "grad_norm": 186.78857421875, + "learning_rate": 1.5500907441016335e-05, + "loss": 33.5074, + "step": 2673 + }, + { + "epoch": 9.65372460496614, + "grad_norm": 212.5107421875, + "learning_rate": 1.549546279491833e-05, + "loss": 33.7103, + "step": 2674 + }, + { + "epoch": 9.657336343115125, + "grad_norm": 243.2950897216797, + "learning_rate": 1.5490018148820326e-05, + "loss": 34.3476, + "step": 2675 + }, + { + "epoch": 9.660948081264108, + "grad_norm": 221.66415405273438, + "learning_rate": 1.548457350272232e-05, + "loss": 34.5377, + "step": 2676 + }, + { + "epoch": 9.664559819413093, + "grad_norm": 231.8260955810547, + "learning_rate": 1.5479128856624317e-05, + "loss": 34.3663, + "step": 2677 + }, + { + "epoch": 9.668171557562077, + "grad_norm": 284.6401062011719, + "learning_rate": 1.547368421052632e-05, + "loss": 35.5723, + "step": 2678 + }, + { + "epoch": 9.67178329571106, + "grad_norm": 373.43865966796875, + "learning_rate": 1.5468239564428314e-05, + "loss": 35.5628, + "step": 2679 + }, + { + "epoch": 9.675395033860045, + "grad_norm": 325.18316650390625, + "learning_rate": 1.546279491833031e-05, + "loss": 35.6192, + "step": 2680 + }, + { + "epoch": 9.675395033860045, + "eval_loss": 0.613842248916626, + "eval_runtime": 3.1437, + "eval_samples_per_second": 56.94, + "eval_steps_per_second": 56.94, + "step": 2680 + }, + { + "epoch": 9.679006772009028, + "grad_norm": 353.14739990234375, + "learning_rate": 1.5457350272232305e-05, + "loss": 36.4789, + "step": 2681 + }, + { + "epoch": 9.682618510158013, + "grad_norm": 215.21836853027344, + "learning_rate": 1.54519056261343e-05, + "loss": 36.0412, + "step": 2682 + }, + { + "epoch": 9.686230248306998, + "grad_norm": 219.64930725097656, + "learning_rate": 1.54464609800363e-05, + "loss": 37.1118, + "step": 2683 + }, + { + "epoch": 9.689841986455981, + "grad_norm": 247.86685180664062, + "learning_rate": 1.5441016333938295e-05, + "loss": 36.488, + "step": 2684 + }, + { + "epoch": 9.693453724604966, + "grad_norm": 248.7967071533203, + "learning_rate": 1.543557168784029e-05, + "loss": 36.2925, + "step": 2685 + }, + { + "epoch": 9.697065462753951, + "grad_norm": 243.1404571533203, + "learning_rate": 1.5430127041742285e-05, + "loss": 37.3986, + "step": 2686 + }, + { + "epoch": 9.700677200902934, + "grad_norm": 276.6585388183594, + "learning_rate": 1.5424682395644284e-05, + "loss": 37.9784, + "step": 2687 + }, + { + "epoch": 9.704288939051919, + "grad_norm": 308.171630859375, + "learning_rate": 1.541923774954628e-05, + "loss": 38.1591, + "step": 2688 + }, + { + "epoch": 9.707900677200904, + "grad_norm": 204.4575653076172, + "learning_rate": 1.541379310344828e-05, + "loss": 27.4514, + "step": 2689 + }, + { + "epoch": 9.711512415349887, + "grad_norm": 160.85946655273438, + "learning_rate": 1.5408348457350274e-05, + "loss": 23.7982, + "step": 2690 + }, + { + "epoch": 9.711512415349887, + "eval_loss": 0.619924008846283, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2690 + }, + { + "epoch": 9.715124153498872, + "grad_norm": 215.60049438476562, + "learning_rate": 1.540290381125227e-05, + "loss": 23.3927, + "step": 2691 + }, + { + "epoch": 9.718735891647855, + "grad_norm": 172.84011840820312, + "learning_rate": 1.5397459165154265e-05, + "loss": 24.1876, + "step": 2692 + }, + { + "epoch": 9.72234762979684, + "grad_norm": 208.42361450195312, + "learning_rate": 1.539201451905626e-05, + "loss": 25.1794, + "step": 2693 + }, + { + "epoch": 9.725959367945824, + "grad_norm": 255.73574829101562, + "learning_rate": 1.538656987295826e-05, + "loss": 42.3484, + "step": 2694 + }, + { + "epoch": 9.729571106094808, + "grad_norm": 239.65533447265625, + "learning_rate": 1.5381125226860254e-05, + "loss": 42.8277, + "step": 2695 + }, + { + "epoch": 9.733182844243792, + "grad_norm": 211.2068634033203, + "learning_rate": 1.5375680580762253e-05, + "loss": 42.6536, + "step": 2696 + }, + { + "epoch": 9.736794582392777, + "grad_norm": 302.85003662109375, + "learning_rate": 1.5370235934664248e-05, + "loss": 42.6263, + "step": 2697 + }, + { + "epoch": 9.74040632054176, + "grad_norm": 211.54754638671875, + "learning_rate": 1.5364791288566244e-05, + "loss": 41.5621, + "step": 2698 + }, + { + "epoch": 9.744018058690745, + "grad_norm": 229.22283935546875, + "learning_rate": 1.535934664246824e-05, + "loss": 43.3765, + "step": 2699 + }, + { + "epoch": 9.747629796839728, + "grad_norm": 206.64794921875, + "learning_rate": 1.5353901996370238e-05, + "loss": 41.4923, + "step": 2700 + }, + { + "epoch": 9.747629796839728, + "eval_loss": 0.6202616095542908, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.981, + "eval_steps_per_second": 56.981, + "step": 2700 + }, + { + "epoch": 9.751241534988713, + "grad_norm": 216.98757934570312, + "learning_rate": 1.5348457350272233e-05, + "loss": 43.1931, + "step": 2701 + }, + { + "epoch": 9.754853273137698, + "grad_norm": 222.7340545654297, + "learning_rate": 1.534301270417423e-05, + "loss": 42.485, + "step": 2702 + }, + { + "epoch": 9.758465011286681, + "grad_norm": 291.3454895019531, + "learning_rate": 1.5337568058076224e-05, + "loss": 41.4766, + "step": 2703 + }, + { + "epoch": 9.762076749435666, + "grad_norm": 239.50341796875, + "learning_rate": 1.533212341197822e-05, + "loss": 41.9215, + "step": 2704 + }, + { + "epoch": 9.76568848758465, + "grad_norm": 179.21839904785156, + "learning_rate": 1.5326678765880218e-05, + "loss": 40.6544, + "step": 2705 + }, + { + "epoch": 9.769300225733634, + "grad_norm": 210.89535522460938, + "learning_rate": 1.5321234119782217e-05, + "loss": 38.6204, + "step": 2706 + }, + { + "epoch": 9.772911963882619, + "grad_norm": 239.23291015625, + "learning_rate": 1.5315789473684212e-05, + "loss": 39.4385, + "step": 2707 + }, + { + "epoch": 9.776523702031604, + "grad_norm": 240.22772216796875, + "learning_rate": 1.5310344827586208e-05, + "loss": 40.0139, + "step": 2708 + }, + { + "epoch": 9.780135440180587, + "grad_norm": 185.4588623046875, + "learning_rate": 1.5304900181488203e-05, + "loss": 38.9331, + "step": 2709 + }, + { + "epoch": 9.783747178329572, + "grad_norm": 263.0315856933594, + "learning_rate": 1.52994555353902e-05, + "loss": 38.5485, + "step": 2710 + }, + { + "epoch": 9.783747178329572, + "eval_loss": 0.615914523601532, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.004, + "eval_steps_per_second": 57.004, + "step": 2710 + }, + { + "epoch": 9.787358916478555, + "grad_norm": 209.05348205566406, + "learning_rate": 1.5294010889292197e-05, + "loss": 39.4875, + "step": 2711 + }, + { + "epoch": 9.79097065462754, + "grad_norm": 209.72293090820312, + "learning_rate": 1.5288566243194193e-05, + "loss": 40.4742, + "step": 2712 + }, + { + "epoch": 9.794582392776524, + "grad_norm": 210.02908325195312, + "learning_rate": 1.5283121597096188e-05, + "loss": 39.924, + "step": 2713 + }, + { + "epoch": 9.798194130925507, + "grad_norm": 204.3467254638672, + "learning_rate": 1.5277676950998183e-05, + "loss": 40.8893, + "step": 2714 + }, + { + "epoch": 9.801805869074492, + "grad_norm": 253.9317626953125, + "learning_rate": 1.5272232304900182e-05, + "loss": 38.3278, + "step": 2715 + }, + { + "epoch": 9.805417607223477, + "grad_norm": 263.6196594238281, + "learning_rate": 1.526678765880218e-05, + "loss": 40.5242, + "step": 2716 + }, + { + "epoch": 9.80902934537246, + "grad_norm": 230.35621643066406, + "learning_rate": 1.5261343012704176e-05, + "loss": 40.683, + "step": 2717 + }, + { + "epoch": 9.812641083521445, + "grad_norm": 190.16323852539062, + "learning_rate": 1.5255898366606172e-05, + "loss": 40.2472, + "step": 2718 + }, + { + "epoch": 9.816252821670428, + "grad_norm": 202.7122344970703, + "learning_rate": 1.5250453720508167e-05, + "loss": 38.9644, + "step": 2719 + }, + { + "epoch": 9.819864559819413, + "grad_norm": 193.65774536132812, + "learning_rate": 1.5245009074410164e-05, + "loss": 40.9982, + "step": 2720 + }, + { + "epoch": 9.819864559819413, + "eval_loss": 0.6152020692825317, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 2720 + }, + { + "epoch": 9.823476297968398, + "grad_norm": 272.0360412597656, + "learning_rate": 1.523956442831216e-05, + "loss": 40.5518, + "step": 2721 + }, + { + "epoch": 9.827088036117381, + "grad_norm": 200.20777893066406, + "learning_rate": 1.5234119782214155e-05, + "loss": 38.4801, + "step": 2722 + }, + { + "epoch": 9.830699774266366, + "grad_norm": 201.44764709472656, + "learning_rate": 1.5228675136116152e-05, + "loss": 35.7499, + "step": 2723 + }, + { + "epoch": 9.83431151241535, + "grad_norm": 234.89706420898438, + "learning_rate": 1.522323049001815e-05, + "loss": 35.4331, + "step": 2724 + }, + { + "epoch": 9.837923250564334, + "grad_norm": 193.27423095703125, + "learning_rate": 1.5217785843920146e-05, + "loss": 33.0281, + "step": 2725 + }, + { + "epoch": 9.841534988713319, + "grad_norm": 222.28060913085938, + "learning_rate": 1.5212341197822143e-05, + "loss": 34.2237, + "step": 2726 + }, + { + "epoch": 9.845146726862303, + "grad_norm": 264.2764587402344, + "learning_rate": 1.5206896551724139e-05, + "loss": 33.7112, + "step": 2727 + }, + { + "epoch": 9.848758465011286, + "grad_norm": 204.5146484375, + "learning_rate": 1.5201451905626134e-05, + "loss": 33.9014, + "step": 2728 + }, + { + "epoch": 9.852370203160271, + "grad_norm": 198.90907287597656, + "learning_rate": 1.5196007259528131e-05, + "loss": 36.6987, + "step": 2729 + }, + { + "epoch": 9.855981941309254, + "grad_norm": 254.19818115234375, + "learning_rate": 1.5190562613430126e-05, + "loss": 35.4466, + "step": 2730 + }, + { + "epoch": 9.855981941309254, + "eval_loss": 0.6153284311294556, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 2730 + }, + { + "epoch": 9.85959367945824, + "grad_norm": 212.53749084472656, + "learning_rate": 1.5185117967332123e-05, + "loss": 35.659, + "step": 2731 + }, + { + "epoch": 9.863205417607224, + "grad_norm": 234.5277557373047, + "learning_rate": 1.5179673321234119e-05, + "loss": 36.7411, + "step": 2732 + }, + { + "epoch": 9.866817155756207, + "grad_norm": 229.25962829589844, + "learning_rate": 1.5174228675136118e-05, + "loss": 36.0713, + "step": 2733 + }, + { + "epoch": 9.870428893905192, + "grad_norm": 259.5096435546875, + "learning_rate": 1.5168784029038115e-05, + "loss": 37.2433, + "step": 2734 + }, + { + "epoch": 9.874040632054175, + "grad_norm": 297.2413024902344, + "learning_rate": 1.516333938294011e-05, + "loss": 37.222, + "step": 2735 + }, + { + "epoch": 9.87765237020316, + "grad_norm": 259.8325500488281, + "learning_rate": 1.5157894736842105e-05, + "loss": 37.096, + "step": 2736 + }, + { + "epoch": 9.881264108352145, + "grad_norm": 275.85888671875, + "learning_rate": 1.5152450090744103e-05, + "loss": 37.769, + "step": 2737 + }, + { + "epoch": 9.884875846501128, + "grad_norm": 261.16656494140625, + "learning_rate": 1.5147005444646098e-05, + "loss": 38.4089, + "step": 2738 + }, + { + "epoch": 9.888487584650113, + "grad_norm": 219.74351501464844, + "learning_rate": 1.5141560798548095e-05, + "loss": 32.5255, + "step": 2739 + }, + { + "epoch": 9.892099322799098, + "grad_norm": 203.9193878173828, + "learning_rate": 1.513611615245009e-05, + "loss": 24.2497, + "step": 2740 + }, + { + "epoch": 9.892099322799098, + "eval_loss": 0.6206448674201965, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 2740 + }, + { + "epoch": 9.89571106094808, + "grad_norm": 224.19454956054688, + "learning_rate": 1.5130671506352086e-05, + "loss": 23.0629, + "step": 2741 + }, + { + "epoch": 9.899322799097066, + "grad_norm": 252.4147186279297, + "learning_rate": 1.5125226860254086e-05, + "loss": 24.5799, + "step": 2742 + }, + { + "epoch": 9.90293453724605, + "grad_norm": 214.79067993164062, + "learning_rate": 1.5119782214156082e-05, + "loss": 24.6773, + "step": 2743 + }, + { + "epoch": 9.906546275395034, + "grad_norm": 225.59848022460938, + "learning_rate": 1.5114337568058077e-05, + "loss": 43.1147, + "step": 2744 + }, + { + "epoch": 9.910158013544018, + "grad_norm": 221.8661651611328, + "learning_rate": 1.5108892921960074e-05, + "loss": 42.7403, + "step": 2745 + }, + { + "epoch": 9.913769751693001, + "grad_norm": 316.3871765136719, + "learning_rate": 1.510344827586207e-05, + "loss": 41.6931, + "step": 2746 + }, + { + "epoch": 9.917381489841986, + "grad_norm": 250.6577911376953, + "learning_rate": 1.5098003629764065e-05, + "loss": 43.3, + "step": 2747 + }, + { + "epoch": 9.920993227990971, + "grad_norm": 222.44386291503906, + "learning_rate": 1.5092558983666062e-05, + "loss": 43.3128, + "step": 2748 + }, + { + "epoch": 9.924604966139954, + "grad_norm": 190.08682250976562, + "learning_rate": 1.5087114337568057e-05, + "loss": 41.4814, + "step": 2749 + }, + { + "epoch": 9.928216704288939, + "grad_norm": 276.9918212890625, + "learning_rate": 1.5081669691470054e-05, + "loss": 41.042, + "step": 2750 + }, + { + "epoch": 9.928216704288939, + "eval_loss": 0.6201648116111755, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.927, + "eval_steps_per_second": 56.927, + "step": 2750 + }, + { + "epoch": 9.931828442437924, + "grad_norm": 269.7344970703125, + "learning_rate": 1.507622504537205e-05, + "loss": 40.3064, + "step": 2751 + }, + { + "epoch": 9.935440180586907, + "grad_norm": 263.11663818359375, + "learning_rate": 1.5070780399274049e-05, + "loss": 40.1675, + "step": 2752 + }, + { + "epoch": 9.939051918735892, + "grad_norm": 210.37635803222656, + "learning_rate": 1.5065335753176046e-05, + "loss": 40.5334, + "step": 2753 + }, + { + "epoch": 9.942663656884875, + "grad_norm": 206.09335327148438, + "learning_rate": 1.5059891107078041e-05, + "loss": 41.0429, + "step": 2754 + }, + { + "epoch": 9.94627539503386, + "grad_norm": 245.45013427734375, + "learning_rate": 1.5054446460980036e-05, + "loss": 40.8831, + "step": 2755 + }, + { + "epoch": 9.949887133182845, + "grad_norm": 216.63075256347656, + "learning_rate": 1.5049001814882033e-05, + "loss": 41.2453, + "step": 2756 + }, + { + "epoch": 9.953498871331828, + "grad_norm": 362.12127685546875, + "learning_rate": 1.5043557168784029e-05, + "loss": 40.4561, + "step": 2757 + }, + { + "epoch": 9.957110609480813, + "grad_norm": 222.01434326171875, + "learning_rate": 1.5038112522686024e-05, + "loss": 41.7307, + "step": 2758 + }, + { + "epoch": 9.960722347629797, + "grad_norm": 289.6107177734375, + "learning_rate": 1.5032667876588021e-05, + "loss": 37.83, + "step": 2759 + }, + { + "epoch": 9.96433408577878, + "grad_norm": 231.75274658203125, + "learning_rate": 1.5027223230490017e-05, + "loss": 34.1728, + "step": 2760 + }, + { + "epoch": 9.96433408577878, + "eval_loss": 0.6177247166633606, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 2760 + }, + { + "epoch": 9.967945823927765, + "grad_norm": 269.4657287597656, + "learning_rate": 1.5021778584392017e-05, + "loss": 33.8501, + "step": 2761 + }, + { + "epoch": 9.97155756207675, + "grad_norm": 229.73004150390625, + "learning_rate": 1.5016333938294013e-05, + "loss": 35.0989, + "step": 2762 + }, + { + "epoch": 9.975169300225733, + "grad_norm": 215.75350952148438, + "learning_rate": 1.5010889292196008e-05, + "loss": 35.1091, + "step": 2763 + }, + { + "epoch": 9.978781038374718, + "grad_norm": 255.36439514160156, + "learning_rate": 1.5005444646098005e-05, + "loss": 36.8373, + "step": 2764 + }, + { + "epoch": 9.982392776523701, + "grad_norm": 226.71084594726562, + "learning_rate": 1.5e-05, + "loss": 36.6244, + "step": 2765 + }, + { + "epoch": 9.986004514672686, + "grad_norm": 264.1791076660156, + "learning_rate": 1.4994555353901996e-05, + "loss": 36.1925, + "step": 2766 + }, + { + "epoch": 9.989616252821671, + "grad_norm": 281.4349060058594, + "learning_rate": 1.4989110707803993e-05, + "loss": 38.5627, + "step": 2767 + }, + { + "epoch": 9.993227990970654, + "grad_norm": 275.13092041015625, + "learning_rate": 1.498366606170599e-05, + "loss": 33.3277, + "step": 2768 + }, + { + "epoch": 9.996839729119639, + "grad_norm": 215.79550170898438, + "learning_rate": 1.4978221415607985e-05, + "loss": 23.7482, + "step": 2769 + }, + { + "epoch": 10.0, + "grad_norm": 162.03152465820312, + "learning_rate": 1.4972776769509982e-05, + "loss": 21.7078, + "step": 2770 + }, + { + "epoch": 10.0, + "eval_loss": 0.6126651763916016, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2770 + }, + { + "epoch": 10.003611738148985, + "grad_norm": 243.1815185546875, + "learning_rate": 1.4967332123411978e-05, + "loss": 42.2449, + "step": 2771 + }, + { + "epoch": 10.007223476297968, + "grad_norm": 183.29127502441406, + "learning_rate": 1.4961887477313977e-05, + "loss": 41.5925, + "step": 2772 + }, + { + "epoch": 10.010835214446953, + "grad_norm": 206.04238891601562, + "learning_rate": 1.4956442831215972e-05, + "loss": 40.6657, + "step": 2773 + }, + { + "epoch": 10.014446952595938, + "grad_norm": 192.1796875, + "learning_rate": 1.4950998185117967e-05, + "loss": 41.7065, + "step": 2774 + }, + { + "epoch": 10.01805869074492, + "grad_norm": 202.77279663085938, + "learning_rate": 1.4945553539019964e-05, + "loss": 42.0608, + "step": 2775 + }, + { + "epoch": 10.021670428893906, + "grad_norm": 242.37734985351562, + "learning_rate": 1.494010889292196e-05, + "loss": 40.9925, + "step": 2776 + }, + { + "epoch": 10.025282167042889, + "grad_norm": 252.01358032226562, + "learning_rate": 1.4934664246823957e-05, + "loss": 41.1401, + "step": 2777 + }, + { + "epoch": 10.028893905191874, + "grad_norm": 205.82388305664062, + "learning_rate": 1.4929219600725954e-05, + "loss": 41.5, + "step": 2778 + }, + { + "epoch": 10.032505643340858, + "grad_norm": 251.53968811035156, + "learning_rate": 1.492377495462795e-05, + "loss": 41.8218, + "step": 2779 + }, + { + "epoch": 10.036117381489841, + "grad_norm": 236.55564880371094, + "learning_rate": 1.4918330308529945e-05, + "loss": 40.803, + "step": 2780 + }, + { + "epoch": 10.036117381489841, + "eval_loss": 0.6173696517944336, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 2780 + }, + { + "epoch": 10.039729119638826, + "grad_norm": 214.9959716796875, + "learning_rate": 1.4912885662431942e-05, + "loss": 40.522, + "step": 2781 + }, + { + "epoch": 10.043340857787811, + "grad_norm": 213.7000732421875, + "learning_rate": 1.4907441016333939e-05, + "loss": 38.8643, + "step": 2782 + }, + { + "epoch": 10.046952595936794, + "grad_norm": 225.6709747314453, + "learning_rate": 1.4901996370235936e-05, + "loss": 38.3625, + "step": 2783 + }, + { + "epoch": 10.050564334085779, + "grad_norm": 208.83712768554688, + "learning_rate": 1.4896551724137931e-05, + "loss": 38.5355, + "step": 2784 + }, + { + "epoch": 10.054176072234762, + "grad_norm": 185.51219177246094, + "learning_rate": 1.4891107078039927e-05, + "loss": 38.4303, + "step": 2785 + }, + { + "epoch": 10.057787810383747, + "grad_norm": 196.68551635742188, + "learning_rate": 1.4885662431941925e-05, + "loss": 38.1895, + "step": 2786 + }, + { + "epoch": 10.061399548532732, + "grad_norm": 207.4806671142578, + "learning_rate": 1.488021778584392e-05, + "loss": 39.2329, + "step": 2787 + }, + { + "epoch": 10.065011286681715, + "grad_norm": 211.640380859375, + "learning_rate": 1.4874773139745916e-05, + "loss": 40.108, + "step": 2788 + }, + { + "epoch": 10.0686230248307, + "grad_norm": 195.97006225585938, + "learning_rate": 1.4869328493647913e-05, + "loss": 39.6883, + "step": 2789 + }, + { + "epoch": 10.072234762979685, + "grad_norm": 207.20169067382812, + "learning_rate": 1.4863883847549909e-05, + "loss": 40.557, + "step": 2790 + }, + { + "epoch": 10.072234762979685, + "eval_loss": 0.6166439652442932, + "eval_runtime": 3.1461, + "eval_samples_per_second": 56.895, + "eval_steps_per_second": 56.895, + "step": 2790 + }, + { + "epoch": 10.075846501128668, + "grad_norm": 168.4052276611328, + "learning_rate": 1.4858439201451906e-05, + "loss": 39.76, + "step": 2791 + }, + { + "epoch": 10.079458239277653, + "grad_norm": 188.55575561523438, + "learning_rate": 1.4852994555353903e-05, + "loss": 40.4776, + "step": 2792 + }, + { + "epoch": 10.083069977426636, + "grad_norm": 181.60801696777344, + "learning_rate": 1.4847549909255898e-05, + "loss": 40.5414, + "step": 2793 + }, + { + "epoch": 10.08668171557562, + "grad_norm": 205.39608764648438, + "learning_rate": 1.4842105263157895e-05, + "loss": 41.4944, + "step": 2794 + }, + { + "epoch": 10.090293453724605, + "grad_norm": 271.0169372558594, + "learning_rate": 1.4836660617059892e-05, + "loss": 40.6805, + "step": 2795 + }, + { + "epoch": 10.093905191873588, + "grad_norm": 241.97889709472656, + "learning_rate": 1.4831215970961888e-05, + "loss": 39.5473, + "step": 2796 + }, + { + "epoch": 10.097516930022573, + "grad_norm": 211.64260864257812, + "learning_rate": 1.4825771324863885e-05, + "loss": 41.0357, + "step": 2797 + }, + { + "epoch": 10.101128668171558, + "grad_norm": 209.52804565429688, + "learning_rate": 1.482032667876588e-05, + "loss": 41.3357, + "step": 2798 + }, + { + "epoch": 10.104740406320541, + "grad_norm": 243.08419799804688, + "learning_rate": 1.4814882032667876e-05, + "loss": 38.6778, + "step": 2799 + }, + { + "epoch": 10.108352144469526, + "grad_norm": 227.17172241210938, + "learning_rate": 1.4809437386569874e-05, + "loss": 35.1128, + "step": 2800 + }, + { + "epoch": 10.108352144469526, + "eval_loss": 0.6153741478919983, + "eval_runtime": 3.143, + "eval_samples_per_second": 56.952, + "eval_steps_per_second": 56.952, + "step": 2800 + }, + { + "epoch": 10.111963882618511, + "grad_norm": 284.7151794433594, + "learning_rate": 1.480399274047187e-05, + "loss": 33.1712, + "step": 2801 + }, + { + "epoch": 10.115575620767494, + "grad_norm": 234.85169982910156, + "learning_rate": 1.4798548094373867e-05, + "loss": 33.495, + "step": 2802 + }, + { + "epoch": 10.119187358916479, + "grad_norm": 236.6138458251953, + "learning_rate": 1.4793103448275862e-05, + "loss": 33.2318, + "step": 2803 + }, + { + "epoch": 10.122799097065462, + "grad_norm": 240.98997497558594, + "learning_rate": 1.4787658802177858e-05, + "loss": 33.9268, + "step": 2804 + }, + { + "epoch": 10.126410835214447, + "grad_norm": 218.304443359375, + "learning_rate": 1.4782214156079856e-05, + "loss": 34.667, + "step": 2805 + }, + { + "epoch": 10.130022573363432, + "grad_norm": 290.30108642578125, + "learning_rate": 1.4776769509981852e-05, + "loss": 36.7153, + "step": 2806 + }, + { + "epoch": 10.133634311512415, + "grad_norm": 267.7265625, + "learning_rate": 1.4771324863883847e-05, + "loss": 35.2035, + "step": 2807 + }, + { + "epoch": 10.1372460496614, + "grad_norm": 300.4646301269531, + "learning_rate": 1.4765880217785844e-05, + "loss": 35.6581, + "step": 2808 + }, + { + "epoch": 10.140857787810384, + "grad_norm": 234.16448974609375, + "learning_rate": 1.4760435571687841e-05, + "loss": 35.8547, + "step": 2809 + }, + { + "epoch": 10.144469525959368, + "grad_norm": 209.23858642578125, + "learning_rate": 1.4754990925589837e-05, + "loss": 34.47, + "step": 2810 + }, + { + "epoch": 10.144469525959368, + "eval_loss": 0.6160662770271301, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 2810 + }, + { + "epoch": 10.148081264108352, + "grad_norm": 207.9628143310547, + "learning_rate": 1.4749546279491834e-05, + "loss": 36.1239, + "step": 2811 + }, + { + "epoch": 10.151693002257336, + "grad_norm": 183.68545532226562, + "learning_rate": 1.4744101633393829e-05, + "loss": 36.759, + "step": 2812 + }, + { + "epoch": 10.15530474040632, + "grad_norm": 222.00164794921875, + "learning_rate": 1.4738656987295826e-05, + "loss": 37.397, + "step": 2813 + }, + { + "epoch": 10.158916478555305, + "grad_norm": 226.9628448486328, + "learning_rate": 1.4733212341197823e-05, + "loss": 36.3648, + "step": 2814 + }, + { + "epoch": 10.162528216704288, + "grad_norm": 271.061279296875, + "learning_rate": 1.4727767695099819e-05, + "loss": 37.8754, + "step": 2815 + }, + { + "epoch": 10.166139954853273, + "grad_norm": 265.2478942871094, + "learning_rate": 1.4722323049001816e-05, + "loss": 33.7491, + "step": 2816 + }, + { + "epoch": 10.169751693002258, + "grad_norm": 227.5030975341797, + "learning_rate": 1.4716878402903811e-05, + "loss": 23.0162, + "step": 2817 + }, + { + "epoch": 10.173363431151241, + "grad_norm": 195.83477783203125, + "learning_rate": 1.4711433756805808e-05, + "loss": 23.5831, + "step": 2818 + }, + { + "epoch": 10.176975169300226, + "grad_norm": 196.982421875, + "learning_rate": 1.4705989110707805e-05, + "loss": 24.1078, + "step": 2819 + }, + { + "epoch": 10.18058690744921, + "grad_norm": 212.73031616210938, + "learning_rate": 1.47005444646098e-05, + "loss": 24.8378, + "step": 2820 + }, + { + "epoch": 10.18058690744921, + "eval_loss": 0.6217848062515259, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 2820 + }, + { + "epoch": 10.184198645598194, + "grad_norm": 261.8343200683594, + "learning_rate": 1.4695099818511796e-05, + "loss": 43.3402, + "step": 2821 + }, + { + "epoch": 10.187810383747179, + "grad_norm": 272.94158935546875, + "learning_rate": 1.4689655172413793e-05, + "loss": 42.8004, + "step": 2822 + }, + { + "epoch": 10.191422121896162, + "grad_norm": 261.5067138671875, + "learning_rate": 1.468421052631579e-05, + "loss": 43.5947, + "step": 2823 + }, + { + "epoch": 10.195033860045147, + "grad_norm": 280.4205322265625, + "learning_rate": 1.4678765880217787e-05, + "loss": 42.1887, + "step": 2824 + }, + { + "epoch": 10.198645598194132, + "grad_norm": 223.82449340820312, + "learning_rate": 1.4673321234119783e-05, + "loss": 40.9825, + "step": 2825 + }, + { + "epoch": 10.202257336343115, + "grad_norm": 261.1077575683594, + "learning_rate": 1.4667876588021778e-05, + "loss": 41.8347, + "step": 2826 + }, + { + "epoch": 10.2058690744921, + "grad_norm": 189.1642608642578, + "learning_rate": 1.4662431941923775e-05, + "loss": 41.7441, + "step": 2827 + }, + { + "epoch": 10.209480812641084, + "grad_norm": 216.94410705566406, + "learning_rate": 1.4656987295825772e-05, + "loss": 42.203, + "step": 2828 + }, + { + "epoch": 10.213092550790067, + "grad_norm": 260.44744873046875, + "learning_rate": 1.4651542649727768e-05, + "loss": 41.8887, + "step": 2829 + }, + { + "epoch": 10.216704288939052, + "grad_norm": 252.21682739257812, + "learning_rate": 1.4646098003629765e-05, + "loss": 42.5977, + "step": 2830 + }, + { + "epoch": 10.216704288939052, + "eval_loss": 0.6175437569618225, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 2830 + }, + { + "epoch": 10.220316027088035, + "grad_norm": 298.4760437011719, + "learning_rate": 1.464065335753176e-05, + "loss": 40.7994, + "step": 2831 + }, + { + "epoch": 10.22392776523702, + "grad_norm": 214.0433349609375, + "learning_rate": 1.4635208711433757e-05, + "loss": 39.1571, + "step": 2832 + }, + { + "epoch": 10.227539503386005, + "grad_norm": 220.59039306640625, + "learning_rate": 1.4629764065335754e-05, + "loss": 38.257, + "step": 2833 + }, + { + "epoch": 10.231151241534988, + "grad_norm": 218.2419891357422, + "learning_rate": 1.462431941923775e-05, + "loss": 38.1954, + "step": 2834 + }, + { + "epoch": 10.234762979683973, + "grad_norm": 241.67674255371094, + "learning_rate": 1.4618874773139747e-05, + "loss": 39.7451, + "step": 2835 + }, + { + "epoch": 10.238374717832958, + "grad_norm": 260.3656005859375, + "learning_rate": 1.4613430127041742e-05, + "loss": 38.8297, + "step": 2836 + }, + { + "epoch": 10.241986455981941, + "grad_norm": 231.78102111816406, + "learning_rate": 1.4607985480943739e-05, + "loss": 38.523, + "step": 2837 + }, + { + "epoch": 10.245598194130926, + "grad_norm": 217.64820861816406, + "learning_rate": 1.4602540834845736e-05, + "loss": 40.0389, + "step": 2838 + }, + { + "epoch": 10.249209932279909, + "grad_norm": 186.45240783691406, + "learning_rate": 1.4597096188747732e-05, + "loss": 40.3306, + "step": 2839 + }, + { + "epoch": 10.252821670428894, + "grad_norm": 225.20480346679688, + "learning_rate": 1.4591651542649727e-05, + "loss": 39.0968, + "step": 2840 + }, + { + "epoch": 10.252821670428894, + "eval_loss": 0.6195141673088074, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 2840 + }, + { + "epoch": 10.256433408577879, + "grad_norm": 367.6174621582031, + "learning_rate": 1.4586206896551724e-05, + "loss": 38.869, + "step": 2841 + }, + { + "epoch": 10.260045146726862, + "grad_norm": 274.3976135253906, + "learning_rate": 1.4580762250453721e-05, + "loss": 39.7781, + "step": 2842 + }, + { + "epoch": 10.263656884875846, + "grad_norm": 193.41665649414062, + "learning_rate": 1.4575317604355718e-05, + "loss": 38.819, + "step": 2843 + }, + { + "epoch": 10.267268623024831, + "grad_norm": 204.2224578857422, + "learning_rate": 1.4569872958257714e-05, + "loss": 41.5495, + "step": 2844 + }, + { + "epoch": 10.270880361173814, + "grad_norm": 276.07476806640625, + "learning_rate": 1.4564428312159709e-05, + "loss": 40.6553, + "step": 2845 + }, + { + "epoch": 10.2744920993228, + "grad_norm": 192.6361541748047, + "learning_rate": 1.4558983666061708e-05, + "loss": 40.2147, + "step": 2846 + }, + { + "epoch": 10.278103837471784, + "grad_norm": 232.6641082763672, + "learning_rate": 1.4553539019963703e-05, + "loss": 40.7223, + "step": 2847 + }, + { + "epoch": 10.281715575620767, + "grad_norm": 266.781005859375, + "learning_rate": 1.4548094373865698e-05, + "loss": 38.0127, + "step": 2848 + }, + { + "epoch": 10.285327313769752, + "grad_norm": 289.5414123535156, + "learning_rate": 1.4542649727767696e-05, + "loss": 35.216, + "step": 2849 + }, + { + "epoch": 10.288939051918735, + "grad_norm": 208.10845947265625, + "learning_rate": 1.4537205081669691e-05, + "loss": 33.829, + "step": 2850 + }, + { + "epoch": 10.288939051918735, + "eval_loss": 0.6140356063842773, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.703, + "eval_steps_per_second": 56.703, + "step": 2850 + }, + { + "epoch": 10.29255079006772, + "grad_norm": 260.80328369140625, + "learning_rate": 1.4531760435571688e-05, + "loss": 33.8409, + "step": 2851 + }, + { + "epoch": 10.296162528216705, + "grad_norm": 202.3874053955078, + "learning_rate": 1.4526315789473685e-05, + "loss": 32.6498, + "step": 2852 + }, + { + "epoch": 10.299774266365688, + "grad_norm": 236.0218048095703, + "learning_rate": 1.452087114337568e-05, + "loss": 33.6538, + "step": 2853 + }, + { + "epoch": 10.303386004514673, + "grad_norm": 219.1603240966797, + "learning_rate": 1.4515426497277678e-05, + "loss": 33.7346, + "step": 2854 + }, + { + "epoch": 10.306997742663658, + "grad_norm": 252.8759307861328, + "learning_rate": 1.4509981851179675e-05, + "loss": 34.6996, + "step": 2855 + }, + { + "epoch": 10.31060948081264, + "grad_norm": 204.89244079589844, + "learning_rate": 1.450453720508167e-05, + "loss": 36.1145, + "step": 2856 + }, + { + "epoch": 10.314221218961626, + "grad_norm": 239.5278778076172, + "learning_rate": 1.4499092558983667e-05, + "loss": 34.8845, + "step": 2857 + }, + { + "epoch": 10.317832957110609, + "grad_norm": 235.02403259277344, + "learning_rate": 1.4493647912885662e-05, + "loss": 36.1006, + "step": 2858 + }, + { + "epoch": 10.321444695259594, + "grad_norm": 219.25686645507812, + "learning_rate": 1.4488203266787658e-05, + "loss": 37.0463, + "step": 2859 + }, + { + "epoch": 10.325056433408578, + "grad_norm": 238.1767578125, + "learning_rate": 1.4482758620689657e-05, + "loss": 35.5543, + "step": 2860 + }, + { + "epoch": 10.325056433408578, + "eval_loss": 0.6116110682487488, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 2860 + }, + { + "epoch": 10.328668171557561, + "grad_norm": 245.4133758544922, + "learning_rate": 1.4477313974591652e-05, + "loss": 35.7557, + "step": 2861 + }, + { + "epoch": 10.332279909706546, + "grad_norm": 231.70779418945312, + "learning_rate": 1.4471869328493647e-05, + "loss": 35.9535, + "step": 2862 + }, + { + "epoch": 10.335891647855531, + "grad_norm": 218.71266174316406, + "learning_rate": 1.4466424682395644e-05, + "loss": 36.747, + "step": 2863 + }, + { + "epoch": 10.339503386004514, + "grad_norm": 206.82247924804688, + "learning_rate": 1.446098003629764e-05, + "loss": 37.4007, + "step": 2864 + }, + { + "epoch": 10.343115124153499, + "grad_norm": 286.6649475097656, + "learning_rate": 1.4455535390199639e-05, + "loss": 38.183, + "step": 2865 + }, + { + "epoch": 10.346726862302482, + "grad_norm": 262.2049865722656, + "learning_rate": 1.4450090744101634e-05, + "loss": 28.1564, + "step": 2866 + }, + { + "epoch": 10.350338600451467, + "grad_norm": 203.03831481933594, + "learning_rate": 1.444464609800363e-05, + "loss": 23.7155, + "step": 2867 + }, + { + "epoch": 10.353950338600452, + "grad_norm": 220.13597106933594, + "learning_rate": 1.4439201451905626e-05, + "loss": 23.5066, + "step": 2868 + }, + { + "epoch": 10.357562076749435, + "grad_norm": 208.22035217285156, + "learning_rate": 1.4433756805807624e-05, + "loss": 23.8087, + "step": 2869 + }, + { + "epoch": 10.36117381489842, + "grad_norm": 202.74989318847656, + "learning_rate": 1.4428312159709619e-05, + "loss": 24.6194, + "step": 2870 + }, + { + "epoch": 10.36117381489842, + "eval_loss": 0.6170971989631653, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 2870 + }, + { + "epoch": 10.364785553047405, + "grad_norm": 251.78924560546875, + "learning_rate": 1.4422867513611616e-05, + "loss": 41.1333, + "step": 2871 + }, + { + "epoch": 10.368397291196388, + "grad_norm": 269.72430419921875, + "learning_rate": 1.4417422867513611e-05, + "loss": 43.5289, + "step": 2872 + }, + { + "epoch": 10.372009029345373, + "grad_norm": 226.14202880859375, + "learning_rate": 1.4411978221415607e-05, + "loss": 42.1575, + "step": 2873 + }, + { + "epoch": 10.375620767494357, + "grad_norm": 230.2255096435547, + "learning_rate": 1.4406533575317606e-05, + "loss": 42.5563, + "step": 2874 + }, + { + "epoch": 10.37923250564334, + "grad_norm": 259.2338562011719, + "learning_rate": 1.4401088929219601e-05, + "loss": 41.517, + "step": 2875 + }, + { + "epoch": 10.382844243792325, + "grad_norm": 280.06414794921875, + "learning_rate": 1.4395644283121598e-05, + "loss": 41.3589, + "step": 2876 + }, + { + "epoch": 10.386455981941308, + "grad_norm": 259.1960754394531, + "learning_rate": 1.4390199637023593e-05, + "loss": 41.539, + "step": 2877 + }, + { + "epoch": 10.390067720090293, + "grad_norm": 244.4931640625, + "learning_rate": 1.438475499092559e-05, + "loss": 41.8689, + "step": 2878 + }, + { + "epoch": 10.393679458239278, + "grad_norm": 195.65065002441406, + "learning_rate": 1.4379310344827588e-05, + "loss": 42.9191, + "step": 2879 + }, + { + "epoch": 10.397291196388261, + "grad_norm": 215.88589477539062, + "learning_rate": 1.4373865698729583e-05, + "loss": 41.4172, + "step": 2880 + }, + { + "epoch": 10.397291196388261, + "eval_loss": 0.6176813840866089, + "eval_runtime": 3.1462, + "eval_samples_per_second": 56.893, + "eval_steps_per_second": 56.893, + "step": 2880 + }, + { + "epoch": 10.400902934537246, + "grad_norm": 175.21368408203125, + "learning_rate": 1.4368421052631578e-05, + "loss": 41.8998, + "step": 2881 + }, + { + "epoch": 10.404514672686231, + "grad_norm": 207.65963745117188, + "learning_rate": 1.4362976406533575e-05, + "loss": 40.33, + "step": 2882 + }, + { + "epoch": 10.408126410835214, + "grad_norm": 213.50526428222656, + "learning_rate": 1.4357531760435572e-05, + "loss": 38.0329, + "step": 2883 + }, + { + "epoch": 10.411738148984199, + "grad_norm": 190.8444366455078, + "learning_rate": 1.4352087114337568e-05, + "loss": 39.0142, + "step": 2884 + }, + { + "epoch": 10.415349887133182, + "grad_norm": 300.2298583984375, + "learning_rate": 1.4346642468239565e-05, + "loss": 38.6364, + "step": 2885 + }, + { + "epoch": 10.418961625282167, + "grad_norm": 183.6144256591797, + "learning_rate": 1.434119782214156e-05, + "loss": 39.6747, + "step": 2886 + }, + { + "epoch": 10.422573363431152, + "grad_norm": 237.85340881347656, + "learning_rate": 1.4335753176043557e-05, + "loss": 38.3018, + "step": 2887 + }, + { + "epoch": 10.426185101580135, + "grad_norm": 325.96624755859375, + "learning_rate": 1.4330308529945554e-05, + "loss": 40.1042, + "step": 2888 + }, + { + "epoch": 10.42979683972912, + "grad_norm": 248.4732666015625, + "learning_rate": 1.432486388384755e-05, + "loss": 40.0357, + "step": 2889 + }, + { + "epoch": 10.433408577878104, + "grad_norm": 374.6653747558594, + "learning_rate": 1.4319419237749547e-05, + "loss": 40.4383, + "step": 2890 + }, + { + "epoch": 10.433408577878104, + "eval_loss": 0.6150367856025696, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.881, + "eval_steps_per_second": 56.881, + "step": 2890 + }, + { + "epoch": 10.437020316027088, + "grad_norm": 229.79647827148438, + "learning_rate": 1.4313974591651542e-05, + "loss": 40.3728, + "step": 2891 + }, + { + "epoch": 10.440632054176072, + "grad_norm": 278.7500915527344, + "learning_rate": 1.430852994555354e-05, + "loss": 39.546, + "step": 2892 + }, + { + "epoch": 10.444243792325057, + "grad_norm": 233.1890106201172, + "learning_rate": 1.4303085299455536e-05, + "loss": 41.8094, + "step": 2893 + }, + { + "epoch": 10.44785553047404, + "grad_norm": 207.7745819091797, + "learning_rate": 1.4297640653357532e-05, + "loss": 40.6225, + "step": 2894 + }, + { + "epoch": 10.451467268623025, + "grad_norm": 233.37892150878906, + "learning_rate": 1.4292196007259529e-05, + "loss": 40.2499, + "step": 2895 + }, + { + "epoch": 10.455079006772008, + "grad_norm": 225.4070587158203, + "learning_rate": 1.4286751361161524e-05, + "loss": 40.3626, + "step": 2896 + }, + { + "epoch": 10.458690744920993, + "grad_norm": 239.60231018066406, + "learning_rate": 1.4281306715063521e-05, + "loss": 40.3149, + "step": 2897 + }, + { + "epoch": 10.462302483069978, + "grad_norm": 225.3981475830078, + "learning_rate": 1.4275862068965518e-05, + "loss": 39.3443, + "step": 2898 + }, + { + "epoch": 10.465914221218961, + "grad_norm": 270.2829284667969, + "learning_rate": 1.4270417422867514e-05, + "loss": 37.8947, + "step": 2899 + }, + { + "epoch": 10.469525959367946, + "grad_norm": 263.66986083984375, + "learning_rate": 1.426497277676951e-05, + "loss": 34.4721, + "step": 2900 + }, + { + "epoch": 10.469525959367946, + "eval_loss": 0.6134031414985657, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 2900 + }, + { + "epoch": 10.47313769751693, + "grad_norm": 189.3812255859375, + "learning_rate": 1.4259528130671508e-05, + "loss": 34.3148, + "step": 2901 + }, + { + "epoch": 10.476749435665914, + "grad_norm": 256.7174987792969, + "learning_rate": 1.4254083484573503e-05, + "loss": 32.1693, + "step": 2902 + }, + { + "epoch": 10.480361173814899, + "grad_norm": 265.40692138671875, + "learning_rate": 1.4248638838475499e-05, + "loss": 34.369, + "step": 2903 + }, + { + "epoch": 10.483972911963882, + "grad_norm": 315.6539001464844, + "learning_rate": 1.4243194192377496e-05, + "loss": 34.9479, + "step": 2904 + }, + { + "epoch": 10.487584650112867, + "grad_norm": 263.7816162109375, + "learning_rate": 1.4237749546279491e-05, + "loss": 33.983, + "step": 2905 + }, + { + "epoch": 10.491196388261852, + "grad_norm": 244.69192504882812, + "learning_rate": 1.423230490018149e-05, + "loss": 36.6685, + "step": 2906 + }, + { + "epoch": 10.494808126410835, + "grad_norm": 224.26071166992188, + "learning_rate": 1.4226860254083485e-05, + "loss": 35.0337, + "step": 2907 + }, + { + "epoch": 10.49841986455982, + "grad_norm": 261.0958557128906, + "learning_rate": 1.422141560798548e-05, + "loss": 34.7154, + "step": 2908 + }, + { + "epoch": 10.502031602708804, + "grad_norm": 245.85960388183594, + "learning_rate": 1.4215970961887478e-05, + "loss": 35.4156, + "step": 2909 + }, + { + "epoch": 10.505643340857787, + "grad_norm": 309.3730163574219, + "learning_rate": 1.4210526315789473e-05, + "loss": 36.3999, + "step": 2910 + }, + { + "epoch": 10.505643340857787, + "eval_loss": 0.6144266128540039, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.853, + "eval_steps_per_second": 56.853, + "step": 2910 + }, + { + "epoch": 10.509255079006772, + "grad_norm": 209.9637451171875, + "learning_rate": 1.420508166969147e-05, + "loss": 37.1515, + "step": 2911 + }, + { + "epoch": 10.512866817155757, + "grad_norm": 254.81683349609375, + "learning_rate": 1.4199637023593467e-05, + "loss": 35.5548, + "step": 2912 + }, + { + "epoch": 10.51647855530474, + "grad_norm": 224.94137573242188, + "learning_rate": 1.4194192377495463e-05, + "loss": 36.7691, + "step": 2913 + }, + { + "epoch": 10.520090293453725, + "grad_norm": 223.81838989257812, + "learning_rate": 1.4188747731397458e-05, + "loss": 37.5904, + "step": 2914 + }, + { + "epoch": 10.523702031602708, + "grad_norm": 308.0168151855469, + "learning_rate": 1.4183303085299457e-05, + "loss": 36.1561, + "step": 2915 + }, + { + "epoch": 10.527313769751693, + "grad_norm": 214.77928161621094, + "learning_rate": 1.4177858439201452e-05, + "loss": 27.6309, + "step": 2916 + }, + { + "epoch": 10.530925507900678, + "grad_norm": 153.77163696289062, + "learning_rate": 1.417241379310345e-05, + "loss": 23.6151, + "step": 2917 + }, + { + "epoch": 10.534537246049661, + "grad_norm": 161.12826538085938, + "learning_rate": 1.4166969147005445e-05, + "loss": 23.1684, + "step": 2918 + }, + { + "epoch": 10.538148984198646, + "grad_norm": 228.01441955566406, + "learning_rate": 1.416152450090744e-05, + "loss": 23.4383, + "step": 2919 + }, + { + "epoch": 10.54176072234763, + "grad_norm": 207.55052185058594, + "learning_rate": 1.4156079854809439e-05, + "loss": 25.4699, + "step": 2920 + }, + { + "epoch": 10.54176072234763, + "eval_loss": 0.6177500486373901, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 2920 + }, + { + "epoch": 10.545372460496614, + "grad_norm": 254.23828125, + "learning_rate": 1.4150635208711434e-05, + "loss": 42.1525, + "step": 2921 + }, + { + "epoch": 10.548984198645599, + "grad_norm": 228.1654815673828, + "learning_rate": 1.414519056261343e-05, + "loss": 42.4282, + "step": 2922 + }, + { + "epoch": 10.552595936794582, + "grad_norm": 258.4981689453125, + "learning_rate": 1.4139745916515427e-05, + "loss": 42.3053, + "step": 2923 + }, + { + "epoch": 10.556207674943566, + "grad_norm": 364.42059326171875, + "learning_rate": 1.4134301270417424e-05, + "loss": 41.9009, + "step": 2924 + }, + { + "epoch": 10.559819413092551, + "grad_norm": 213.5066375732422, + "learning_rate": 1.412885662431942e-05, + "loss": 41.0624, + "step": 2925 + }, + { + "epoch": 10.563431151241534, + "grad_norm": 214.23472595214844, + "learning_rate": 1.4123411978221416e-05, + "loss": 42.2508, + "step": 2926 + }, + { + "epoch": 10.56704288939052, + "grad_norm": 249.8063201904297, + "learning_rate": 1.4117967332123412e-05, + "loss": 43.0671, + "step": 2927 + }, + { + "epoch": 10.570654627539504, + "grad_norm": 210.0769805908203, + "learning_rate": 1.4112522686025409e-05, + "loss": 43.4018, + "step": 2928 + }, + { + "epoch": 10.574266365688487, + "grad_norm": 255.67225646972656, + "learning_rate": 1.4107078039927406e-05, + "loss": 42.9609, + "step": 2929 + }, + { + "epoch": 10.577878103837472, + "grad_norm": 294.2599182128906, + "learning_rate": 1.4101633393829401e-05, + "loss": 41.8748, + "step": 2930 + }, + { + "epoch": 10.577878103837472, + "eval_loss": 0.6147512793540955, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 2930 + }, + { + "epoch": 10.581489841986457, + "grad_norm": 212.6685333251953, + "learning_rate": 1.4096188747731398e-05, + "loss": 42.4291, + "step": 2931 + }, + { + "epoch": 10.58510158013544, + "grad_norm": 297.016357421875, + "learning_rate": 1.4090744101633394e-05, + "loss": 39.7291, + "step": 2932 + }, + { + "epoch": 10.588713318284425, + "grad_norm": 280.308837890625, + "learning_rate": 1.4085299455535389e-05, + "loss": 37.4836, + "step": 2933 + }, + { + "epoch": 10.592325056433408, + "grad_norm": 230.28994750976562, + "learning_rate": 1.4079854809437388e-05, + "loss": 39.4075, + "step": 2934 + }, + { + "epoch": 10.595936794582393, + "grad_norm": 377.0367126464844, + "learning_rate": 1.4074410163339383e-05, + "loss": 40.5601, + "step": 2935 + }, + { + "epoch": 10.599548532731378, + "grad_norm": 238.51597595214844, + "learning_rate": 1.406896551724138e-05, + "loss": 38.1238, + "step": 2936 + }, + { + "epoch": 10.60316027088036, + "grad_norm": 197.5536651611328, + "learning_rate": 1.4063520871143376e-05, + "loss": 38.2997, + "step": 2937 + }, + { + "epoch": 10.606772009029346, + "grad_norm": 211.65162658691406, + "learning_rate": 1.4058076225045373e-05, + "loss": 39.1501, + "step": 2938 + }, + { + "epoch": 10.610383747178329, + "grad_norm": 266.4801940917969, + "learning_rate": 1.405263157894737e-05, + "loss": 40.5761, + "step": 2939 + }, + { + "epoch": 10.613995485327314, + "grad_norm": 210.29478454589844, + "learning_rate": 1.4047186932849365e-05, + "loss": 39.7387, + "step": 2940 + }, + { + "epoch": 10.613995485327314, + "eval_loss": 0.6154477000236511, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 2940 + }, + { + "epoch": 10.617607223476298, + "grad_norm": 318.0694580078125, + "learning_rate": 1.404174228675136e-05, + "loss": 38.691, + "step": 2941 + }, + { + "epoch": 10.621218961625281, + "grad_norm": 351.12811279296875, + "learning_rate": 1.4036297640653358e-05, + "loss": 40.3878, + "step": 2942 + }, + { + "epoch": 10.624830699774266, + "grad_norm": 259.8601989746094, + "learning_rate": 1.4030852994555355e-05, + "loss": 38.4447, + "step": 2943 + }, + { + "epoch": 10.628442437923251, + "grad_norm": 249.7741241455078, + "learning_rate": 1.402540834845735e-05, + "loss": 41.1242, + "step": 2944 + }, + { + "epoch": 10.632054176072234, + "grad_norm": 207.11119079589844, + "learning_rate": 1.4019963702359347e-05, + "loss": 40.1977, + "step": 2945 + }, + { + "epoch": 10.635665914221219, + "grad_norm": 199.37295532226562, + "learning_rate": 1.4014519056261343e-05, + "loss": 40.71, + "step": 2946 + }, + { + "epoch": 10.639277652370204, + "grad_norm": 238.85061645507812, + "learning_rate": 1.4009074410163341e-05, + "loss": 41.8822, + "step": 2947 + }, + { + "epoch": 10.642889390519187, + "grad_norm": 212.46388244628906, + "learning_rate": 1.4003629764065337e-05, + "loss": 40.5648, + "step": 2948 + }, + { + "epoch": 10.646501128668172, + "grad_norm": 217.60386657714844, + "learning_rate": 1.3998185117967332e-05, + "loss": 39.6074, + "step": 2949 + }, + { + "epoch": 10.650112866817155, + "grad_norm": 223.88645935058594, + "learning_rate": 1.399274047186933e-05, + "loss": 37.7394, + "step": 2950 + }, + { + "epoch": 10.650112866817155, + "eval_loss": 0.6133999228477478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 2950 + }, + { + "epoch": 10.65372460496614, + "grad_norm": 248.87986755371094, + "learning_rate": 1.3987295825771325e-05, + "loss": 34.911, + "step": 2951 + }, + { + "epoch": 10.657336343115125, + "grad_norm": 238.0355987548828, + "learning_rate": 1.3981851179673322e-05, + "loss": 34.0325, + "step": 2952 + }, + { + "epoch": 10.660948081264108, + "grad_norm": 212.9556121826172, + "learning_rate": 1.3976406533575319e-05, + "loss": 34.9663, + "step": 2953 + }, + { + "epoch": 10.664559819413093, + "grad_norm": 274.4277648925781, + "learning_rate": 1.3970961887477314e-05, + "loss": 34.2399, + "step": 2954 + }, + { + "epoch": 10.668171557562077, + "grad_norm": 211.77976989746094, + "learning_rate": 1.396551724137931e-05, + "loss": 33.7609, + "step": 2955 + }, + { + "epoch": 10.67178329571106, + "grad_norm": 280.6621398925781, + "learning_rate": 1.3960072595281307e-05, + "loss": 35.2616, + "step": 2956 + }, + { + "epoch": 10.675395033860045, + "grad_norm": 239.06439208984375, + "learning_rate": 1.3954627949183304e-05, + "loss": 34.2542, + "step": 2957 + }, + { + "epoch": 10.679006772009028, + "grad_norm": 271.45806884765625, + "learning_rate": 1.39491833030853e-05, + "loss": 36.0551, + "step": 2958 + }, + { + "epoch": 10.682618510158013, + "grad_norm": 247.76486206054688, + "learning_rate": 1.3943738656987296e-05, + "loss": 36.9935, + "step": 2959 + }, + { + "epoch": 10.686230248306998, + "grad_norm": 259.47930908203125, + "learning_rate": 1.3938294010889292e-05, + "loss": 36.7769, + "step": 2960 + }, + { + "epoch": 10.686230248306998, + "eval_loss": 0.6107803583145142, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.138, + "eval_steps_per_second": 57.138, + "step": 2960 + }, + { + "epoch": 10.689841986455981, + "grad_norm": 247.50103759765625, + "learning_rate": 1.393284936479129e-05, + "loss": 35.4848, + "step": 2961 + }, + { + "epoch": 10.693453724604966, + "grad_norm": 242.37330627441406, + "learning_rate": 1.3927404718693286e-05, + "loss": 36.3881, + "step": 2962 + }, + { + "epoch": 10.697065462753951, + "grad_norm": 200.2835693359375, + "learning_rate": 1.3921960072595281e-05, + "loss": 37.2684, + "step": 2963 + }, + { + "epoch": 10.700677200902934, + "grad_norm": 261.6256103515625, + "learning_rate": 1.3916515426497278e-05, + "loss": 37.4581, + "step": 2964 + }, + { + "epoch": 10.704288939051919, + "grad_norm": 243.7251434326172, + "learning_rate": 1.3911070780399274e-05, + "loss": 35.8237, + "step": 2965 + }, + { + "epoch": 10.707900677200904, + "grad_norm": 172.99339294433594, + "learning_rate": 1.390562613430127e-05, + "loss": 29.5815, + "step": 2966 + }, + { + "epoch": 10.711512415349887, + "grad_norm": 168.88490295410156, + "learning_rate": 1.3900181488203268e-05, + "loss": 23.6597, + "step": 2967 + }, + { + "epoch": 10.715124153498872, + "grad_norm": 213.0456085205078, + "learning_rate": 1.3894736842105263e-05, + "loss": 22.5034, + "step": 2968 + }, + { + "epoch": 10.718735891647855, + "grad_norm": 183.87222290039062, + "learning_rate": 1.388929219600726e-05, + "loss": 24.1696, + "step": 2969 + }, + { + "epoch": 10.72234762979684, + "grad_norm": 179.4297637939453, + "learning_rate": 1.3883847549909256e-05, + "loss": 24.8905, + "step": 2970 + }, + { + "epoch": 10.72234762979684, + "eval_loss": 0.6176853179931641, + "eval_runtime": 3.1438, + "eval_samples_per_second": 56.938, + "eval_steps_per_second": 56.938, + "step": 2970 + }, + { + "epoch": 10.725959367945824, + "grad_norm": 214.10662841796875, + "learning_rate": 1.3878402903811253e-05, + "loss": 40.6941, + "step": 2971 + }, + { + "epoch": 10.729571106094808, + "grad_norm": 199.4381103515625, + "learning_rate": 1.387295825771325e-05, + "loss": 42.6363, + "step": 2972 + }, + { + "epoch": 10.733182844243792, + "grad_norm": 182.74517822265625, + "learning_rate": 1.3867513611615245e-05, + "loss": 40.9695, + "step": 2973 + }, + { + "epoch": 10.736794582392777, + "grad_norm": 182.41421508789062, + "learning_rate": 1.386206896551724e-05, + "loss": 40.8893, + "step": 2974 + }, + { + "epoch": 10.74040632054176, + "grad_norm": 215.42904663085938, + "learning_rate": 1.385662431941924e-05, + "loss": 40.6667, + "step": 2975 + }, + { + "epoch": 10.744018058690745, + "grad_norm": 208.15133666992188, + "learning_rate": 1.3851179673321235e-05, + "loss": 42.0714, + "step": 2976 + }, + { + "epoch": 10.747629796839728, + "grad_norm": 224.70242309570312, + "learning_rate": 1.384573502722323e-05, + "loss": 40.9404, + "step": 2977 + }, + { + "epoch": 10.751241534988713, + "grad_norm": 241.45301818847656, + "learning_rate": 1.3840290381125227e-05, + "loss": 43.5597, + "step": 2978 + }, + { + "epoch": 10.754853273137698, + "grad_norm": 201.2677459716797, + "learning_rate": 1.3834845735027222e-05, + "loss": 42.7741, + "step": 2979 + }, + { + "epoch": 10.758465011286681, + "grad_norm": 246.30873107910156, + "learning_rate": 1.3829401088929221e-05, + "loss": 41.7873, + "step": 2980 + }, + { + "epoch": 10.758465011286681, + "eval_loss": 0.6206657886505127, + "eval_runtime": 3.1469, + "eval_samples_per_second": 56.882, + "eval_steps_per_second": 56.882, + "step": 2980 + }, + { + "epoch": 10.762076749435666, + "grad_norm": 206.91009521484375, + "learning_rate": 1.3823956442831217e-05, + "loss": 42.3601, + "step": 2981 + }, + { + "epoch": 10.76568848758465, + "grad_norm": 206.37472534179688, + "learning_rate": 1.3818511796733212e-05, + "loss": 38.5536, + "step": 2982 + }, + { + "epoch": 10.769300225733634, + "grad_norm": 206.49070739746094, + "learning_rate": 1.3813067150635209e-05, + "loss": 38.1051, + "step": 2983 + }, + { + "epoch": 10.772911963882619, + "grad_norm": 215.02455139160156, + "learning_rate": 1.3807622504537206e-05, + "loss": 39.0797, + "step": 2984 + }, + { + "epoch": 10.776523702031604, + "grad_norm": 254.23757934570312, + "learning_rate": 1.3802177858439202e-05, + "loss": 39.419, + "step": 2985 + }, + { + "epoch": 10.780135440180587, + "grad_norm": 205.85079956054688, + "learning_rate": 1.3796733212341199e-05, + "loss": 39.2075, + "step": 2986 + }, + { + "epoch": 10.783747178329572, + "grad_norm": 216.0372314453125, + "learning_rate": 1.3791288566243194e-05, + "loss": 38.5652, + "step": 2987 + }, + { + "epoch": 10.787358916478555, + "grad_norm": 258.47650146484375, + "learning_rate": 1.3785843920145191e-05, + "loss": 38.1968, + "step": 2988 + }, + { + "epoch": 10.79097065462754, + "grad_norm": 289.07354736328125, + "learning_rate": 1.3780399274047188e-05, + "loss": 40.2233, + "step": 2989 + }, + { + "epoch": 10.794582392776524, + "grad_norm": 332.9964904785156, + "learning_rate": 1.3774954627949184e-05, + "loss": 39.5959, + "step": 2990 + }, + { + "epoch": 10.794582392776524, + "eval_loss": 0.6167517304420471, + "eval_runtime": 3.1556, + "eval_samples_per_second": 56.724, + "eval_steps_per_second": 56.724, + "step": 2990 + }, + { + "epoch": 10.798194130925507, + "grad_norm": 205.10699462890625, + "learning_rate": 1.376950998185118e-05, + "loss": 40.2468, + "step": 2991 + }, + { + "epoch": 10.801805869074492, + "grad_norm": 270.2808837890625, + "learning_rate": 1.3764065335753176e-05, + "loss": 37.5956, + "step": 2992 + }, + { + "epoch": 10.805417607223477, + "grad_norm": 199.32044982910156, + "learning_rate": 1.3758620689655171e-05, + "loss": 38.7289, + "step": 2993 + }, + { + "epoch": 10.80902934537246, + "grad_norm": 196.97547912597656, + "learning_rate": 1.375317604355717e-05, + "loss": 40.6707, + "step": 2994 + }, + { + "epoch": 10.812641083521445, + "grad_norm": 219.34588623046875, + "learning_rate": 1.3747731397459166e-05, + "loss": 39.6782, + "step": 2995 + }, + { + "epoch": 10.816252821670428, + "grad_norm": 261.7323913574219, + "learning_rate": 1.3742286751361161e-05, + "loss": 41.1828, + "step": 2996 + }, + { + "epoch": 10.819864559819413, + "grad_norm": 250.89186096191406, + "learning_rate": 1.3736842105263158e-05, + "loss": 41.3582, + "step": 2997 + }, + { + "epoch": 10.823476297968398, + "grad_norm": 284.7223205566406, + "learning_rate": 1.3731397459165155e-05, + "loss": 39.3584, + "step": 2998 + }, + { + "epoch": 10.827088036117381, + "grad_norm": 212.9114990234375, + "learning_rate": 1.3725952813067152e-05, + "loss": 37.5373, + "step": 2999 + }, + { + "epoch": 10.830699774266366, + "grad_norm": 182.8346405029297, + "learning_rate": 1.3720508166969148e-05, + "loss": 35.2027, + "step": 3000 + }, + { + "epoch": 10.830699774266366, + "eval_loss": 0.6083630919456482, + "eval_runtime": 3.1568, + "eval_samples_per_second": 56.702, + "eval_steps_per_second": 56.702, + "step": 3000 + }, + { + "epoch": 10.83431151241535, + "grad_norm": 259.0496520996094, + "learning_rate": 1.3715063520871143e-05, + "loss": 33.4937, + "step": 3001 + }, + { + "epoch": 10.837923250564334, + "grad_norm": 173.037353515625, + "learning_rate": 1.370961887477314e-05, + "loss": 32.8549, + "step": 3002 + }, + { + "epoch": 10.841534988713319, + "grad_norm": 257.9381408691406, + "learning_rate": 1.3704174228675137e-05, + "loss": 33.9163, + "step": 3003 + }, + { + "epoch": 10.845146726862303, + "grad_norm": 248.58355712890625, + "learning_rate": 1.3698729582577132e-05, + "loss": 34.3948, + "step": 3004 + }, + { + "epoch": 10.848758465011286, + "grad_norm": 277.0877990722656, + "learning_rate": 1.369328493647913e-05, + "loss": 34.2868, + "step": 3005 + }, + { + "epoch": 10.852370203160271, + "grad_norm": 220.54014587402344, + "learning_rate": 1.3687840290381125e-05, + "loss": 35.2502, + "step": 3006 + }, + { + "epoch": 10.855981941309254, + "grad_norm": 248.14111328125, + "learning_rate": 1.3682395644283122e-05, + "loss": 33.4599, + "step": 3007 + }, + { + "epoch": 10.85959367945824, + "grad_norm": 284.2827453613281, + "learning_rate": 1.3676950998185119e-05, + "loss": 34.2927, + "step": 3008 + }, + { + "epoch": 10.863205417607224, + "grad_norm": 236.78201293945312, + "learning_rate": 1.3671506352087114e-05, + "loss": 34.9322, + "step": 3009 + }, + { + "epoch": 10.866817155756207, + "grad_norm": 245.58331298828125, + "learning_rate": 1.3666061705989112e-05, + "loss": 35.7628, + "step": 3010 + }, + { + "epoch": 10.866817155756207, + "eval_loss": 0.6125946640968323, + "eval_runtime": 3.1644, + "eval_samples_per_second": 56.566, + "eval_steps_per_second": 56.566, + "step": 3010 + }, + { + "epoch": 10.870428893905192, + "grad_norm": 217.79248046875, + "learning_rate": 1.3660617059891107e-05, + "loss": 35.7332, + "step": 3011 + }, + { + "epoch": 10.874040632054175, + "grad_norm": 258.78729248046875, + "learning_rate": 1.3655172413793104e-05, + "loss": 38.293, + "step": 3012 + }, + { + "epoch": 10.87765237020316, + "grad_norm": 253.94757080078125, + "learning_rate": 1.3649727767695101e-05, + "loss": 37.511, + "step": 3013 + }, + { + "epoch": 10.881264108352145, + "grad_norm": 265.5654602050781, + "learning_rate": 1.3644283121597096e-05, + "loss": 37.5786, + "step": 3014 + }, + { + "epoch": 10.884875846501128, + "grad_norm": 252.11453247070312, + "learning_rate": 1.3638838475499092e-05, + "loss": 37.1039, + "step": 3015 + }, + { + "epoch": 10.888487584650113, + "grad_norm": 259.5934753417969, + "learning_rate": 1.3633393829401089e-05, + "loss": 35.2651, + "step": 3016 + }, + { + "epoch": 10.892099322799098, + "grad_norm": 194.3569793701172, + "learning_rate": 1.3627949183303086e-05, + "loss": 23.7438, + "step": 3017 + }, + { + "epoch": 10.89571106094808, + "grad_norm": 233.95205688476562, + "learning_rate": 1.3622504537205081e-05, + "loss": 23.0061, + "step": 3018 + }, + { + "epoch": 10.899322799097066, + "grad_norm": 185.18495178222656, + "learning_rate": 1.3617059891107078e-05, + "loss": 24.5404, + "step": 3019 + }, + { + "epoch": 10.90293453724605, + "grad_norm": 200.27029418945312, + "learning_rate": 1.3611615245009074e-05, + "loss": 24.3629, + "step": 3020 + }, + { + "epoch": 10.90293453724605, + "eval_loss": 0.6178797483444214, + "eval_runtime": 3.1498, + "eval_samples_per_second": 56.829, + "eval_steps_per_second": 56.829, + "step": 3020 + }, + { + "epoch": 10.906546275395034, + "grad_norm": 226.4281463623047, + "learning_rate": 1.3606170598911073e-05, + "loss": 41.7249, + "step": 3021 + }, + { + "epoch": 10.910158013544018, + "grad_norm": 207.73768615722656, + "learning_rate": 1.3600725952813068e-05, + "loss": 42.1902, + "step": 3022 + }, + { + "epoch": 10.913769751693001, + "grad_norm": 248.69773864746094, + "learning_rate": 1.3595281306715063e-05, + "loss": 40.8419, + "step": 3023 + }, + { + "epoch": 10.917381489841986, + "grad_norm": 224.0100860595703, + "learning_rate": 1.358983666061706e-05, + "loss": 41.483, + "step": 3024 + }, + { + "epoch": 10.920993227990971, + "grad_norm": 217.3524932861328, + "learning_rate": 1.3584392014519056e-05, + "loss": 42.4667, + "step": 3025 + }, + { + "epoch": 10.924604966139954, + "grad_norm": 226.0863494873047, + "learning_rate": 1.3578947368421053e-05, + "loss": 40.8693, + "step": 3026 + }, + { + "epoch": 10.928216704288939, + "grad_norm": 278.3658447265625, + "learning_rate": 1.357350272232305e-05, + "loss": 39.5165, + "step": 3027 + }, + { + "epoch": 10.931828442437924, + "grad_norm": 226.6543731689453, + "learning_rate": 1.3568058076225045e-05, + "loss": 39.3144, + "step": 3028 + }, + { + "epoch": 10.935440180586907, + "grad_norm": 215.39073181152344, + "learning_rate": 1.3562613430127042e-05, + "loss": 39.9823, + "step": 3029 + }, + { + "epoch": 10.939051918735892, + "grad_norm": 239.6291961669922, + "learning_rate": 1.355716878402904e-05, + "loss": 40.898, + "step": 3030 + }, + { + "epoch": 10.939051918735892, + "eval_loss": 0.6163076162338257, + "eval_runtime": 3.153, + "eval_samples_per_second": 56.771, + "eval_steps_per_second": 56.771, + "step": 3030 + }, + { + "epoch": 10.942663656884875, + "grad_norm": 251.20431518554688, + "learning_rate": 1.3551724137931035e-05, + "loss": 40.8357, + "step": 3031 + }, + { + "epoch": 10.94627539503386, + "grad_norm": 243.96022033691406, + "learning_rate": 1.3546279491833032e-05, + "loss": 39.1261, + "step": 3032 + }, + { + "epoch": 10.949887133182845, + "grad_norm": 248.15545654296875, + "learning_rate": 1.3540834845735027e-05, + "loss": 40.9375, + "step": 3033 + }, + { + "epoch": 10.953498871331828, + "grad_norm": 215.00927734375, + "learning_rate": 1.3535390199637023e-05, + "loss": 42.4167, + "step": 3034 + }, + { + "epoch": 10.957110609480813, + "grad_norm": 263.11566162109375, + "learning_rate": 1.3529945553539021e-05, + "loss": 40.7363, + "step": 3035 + }, + { + "epoch": 10.960722347629797, + "grad_norm": 208.59628295898438, + "learning_rate": 1.3524500907441017e-05, + "loss": 35.7124, + "step": 3036 + }, + { + "epoch": 10.96433408577878, + "grad_norm": 187.6036834716797, + "learning_rate": 1.3519056261343012e-05, + "loss": 33.7512, + "step": 3037 + }, + { + "epoch": 10.967945823927765, + "grad_norm": 217.89825439453125, + "learning_rate": 1.351361161524501e-05, + "loss": 33.4262, + "step": 3038 + }, + { + "epoch": 10.97155756207675, + "grad_norm": 235.59889221191406, + "learning_rate": 1.3508166969147005e-05, + "loss": 35.2587, + "step": 3039 + }, + { + "epoch": 10.975169300225733, + "grad_norm": 261.9609680175781, + "learning_rate": 1.3502722323049003e-05, + "loss": 36.1296, + "step": 3040 + }, + { + "epoch": 10.975169300225733, + "eval_loss": 0.610818088054657, + "eval_runtime": 3.1502, + "eval_samples_per_second": 56.822, + "eval_steps_per_second": 56.822, + "step": 3040 + }, + { + "epoch": 10.978781038374718, + "grad_norm": 239.44386291503906, + "learning_rate": 1.3497277676950999e-05, + "loss": 35.6712, + "step": 3041 + }, + { + "epoch": 10.982392776523701, + "grad_norm": 260.9620666503906, + "learning_rate": 1.3491833030852994e-05, + "loss": 35.9054, + "step": 3042 + }, + { + "epoch": 10.986004514672686, + "grad_norm": 246.35678100585938, + "learning_rate": 1.3486388384754991e-05, + "loss": 35.6071, + "step": 3043 + }, + { + "epoch": 10.989616252821671, + "grad_norm": 259.808349609375, + "learning_rate": 1.3480943738656988e-05, + "loss": 37.8261, + "step": 3044 + }, + { + "epoch": 10.993227990970654, + "grad_norm": 187.34579467773438, + "learning_rate": 1.3475499092558984e-05, + "loss": 29.4662, + "step": 3045 + }, + { + "epoch": 10.996839729119639, + "grad_norm": 235.4073486328125, + "learning_rate": 1.3470054446460981e-05, + "loss": 23.668, + "step": 3046 + }, + { + "epoch": 11.0, + "grad_norm": 171.45904541015625, + "learning_rate": 1.3464609800362976e-05, + "loss": 21.3995, + "step": 3047 + }, + { + "epoch": 11.003611738148985, + "grad_norm": 262.18798828125, + "learning_rate": 1.3459165154264972e-05, + "loss": 40.2072, + "step": 3048 + }, + { + "epoch": 11.007223476297968, + "grad_norm": 298.67755126953125, + "learning_rate": 1.345372050816697e-05, + "loss": 42.5345, + "step": 3049 + }, + { + "epoch": 11.010835214446953, + "grad_norm": 215.71389770507812, + "learning_rate": 1.3448275862068966e-05, + "loss": 41.3491, + "step": 3050 + }, + { + "epoch": 11.010835214446953, + "eval_loss": 0.6099278330802917, + "eval_runtime": 3.1503, + "eval_samples_per_second": 56.82, + "eval_steps_per_second": 56.82, + "step": 3050 + }, + { + "epoch": 11.014446952595938, + "grad_norm": 243.77044677734375, + "learning_rate": 1.3442831215970963e-05, + "loss": 41.0093, + "step": 3051 + }, + { + "epoch": 11.01805869074492, + "grad_norm": 205.8600616455078, + "learning_rate": 1.3437386569872958e-05, + "loss": 41.944, + "step": 3052 + }, + { + "epoch": 11.021670428893906, + "grad_norm": 204.25608825683594, + "learning_rate": 1.3431941923774955e-05, + "loss": 39.3595, + "step": 3053 + }, + { + "epoch": 11.025282167042889, + "grad_norm": 195.03114318847656, + "learning_rate": 1.3426497277676952e-05, + "loss": 42.0208, + "step": 3054 + }, + { + "epoch": 11.028893905191874, + "grad_norm": 193.05857849121094, + "learning_rate": 1.3421052631578948e-05, + "loss": 41.2148, + "step": 3055 + }, + { + "epoch": 11.032505643340858, + "grad_norm": 255.9553680419922, + "learning_rate": 1.3415607985480943e-05, + "loss": 41.6029, + "step": 3056 + }, + { + "epoch": 11.036117381489841, + "grad_norm": 234.97799682617188, + "learning_rate": 1.341016333938294e-05, + "loss": 41.2583, + "step": 3057 + }, + { + "epoch": 11.039729119638826, + "grad_norm": 183.76707458496094, + "learning_rate": 1.3404718693284937e-05, + "loss": 39.4893, + "step": 3058 + }, + { + "epoch": 11.043340857787811, + "grad_norm": 162.30191040039062, + "learning_rate": 1.3399274047186933e-05, + "loss": 37.697, + "step": 3059 + }, + { + "epoch": 11.046952595936794, + "grad_norm": 223.8235626220703, + "learning_rate": 1.339382940108893e-05, + "loss": 37.2762, + "step": 3060 + }, + { + "epoch": 11.046952595936794, + "eval_loss": 0.6099210381507874, + "eval_runtime": 3.1526, + "eval_samples_per_second": 56.778, + "eval_steps_per_second": 56.778, + "step": 3060 + }, + { + "epoch": 11.050564334085779, + "grad_norm": 203.874755859375, + "learning_rate": 1.3388384754990925e-05, + "loss": 37.7674, + "step": 3061 + }, + { + "epoch": 11.054176072234762, + "grad_norm": 222.9609832763672, + "learning_rate": 1.3382940108892922e-05, + "loss": 39.5784, + "step": 3062 + }, + { + "epoch": 11.057787810383747, + "grad_norm": 177.81871032714844, + "learning_rate": 1.337749546279492e-05, + "loss": 37.5264, + "step": 3063 + }, + { + "epoch": 11.061399548532732, + "grad_norm": 209.53326416015625, + "learning_rate": 1.3372050816696915e-05, + "loss": 38.5067, + "step": 3064 + }, + { + "epoch": 11.065011286681715, + "grad_norm": 228.35260009765625, + "learning_rate": 1.3366606170598912e-05, + "loss": 37.5329, + "step": 3065 + }, + { + "epoch": 11.0686230248307, + "grad_norm": 231.5054168701172, + "learning_rate": 1.3361161524500907e-05, + "loss": 39.8565, + "step": 3066 + }, + { + "epoch": 11.072234762979685, + "grad_norm": 184.31460571289062, + "learning_rate": 1.3355716878402904e-05, + "loss": 37.9703, + "step": 3067 + }, + { + "epoch": 11.075846501128668, + "grad_norm": 230.06463623046875, + "learning_rate": 1.3350272232304901e-05, + "loss": 39.1406, + "step": 3068 + }, + { + "epoch": 11.079458239277653, + "grad_norm": 263.3990478515625, + "learning_rate": 1.3344827586206897e-05, + "loss": 39.8019, + "step": 3069 + }, + { + "epoch": 11.083069977426636, + "grad_norm": 217.89923095703125, + "learning_rate": 1.3339382940108892e-05, + "loss": 40.195, + "step": 3070 + }, + { + "epoch": 11.083069977426636, + "eval_loss": 0.6136859655380249, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.97, + "eval_steps_per_second": 56.97, + "step": 3070 + }, + { + "epoch": 11.08668171557562, + "grad_norm": 238.8343505859375, + "learning_rate": 1.333393829401089e-05, + "loss": 39.1668, + "step": 3071 + }, + { + "epoch": 11.090293453724605, + "grad_norm": 288.6470947265625, + "learning_rate": 1.3328493647912886e-05, + "loss": 40.3355, + "step": 3072 + }, + { + "epoch": 11.093905191873588, + "grad_norm": 284.3423156738281, + "learning_rate": 1.3323049001814883e-05, + "loss": 41.5359, + "step": 3073 + }, + { + "epoch": 11.097516930022573, + "grad_norm": 263.0945739746094, + "learning_rate": 1.3317604355716879e-05, + "loss": 41.3219, + "step": 3074 + }, + { + "epoch": 11.101128668171558, + "grad_norm": 208.96383666992188, + "learning_rate": 1.3312159709618874e-05, + "loss": 39.7292, + "step": 3075 + }, + { + "epoch": 11.104740406320541, + "grad_norm": 233.49888610839844, + "learning_rate": 1.3306715063520873e-05, + "loss": 35.282, + "step": 3076 + }, + { + "epoch": 11.108352144469526, + "grad_norm": 216.6250762939453, + "learning_rate": 1.3301270417422868e-05, + "loss": 34.4335, + "step": 3077 + }, + { + "epoch": 11.111963882618511, + "grad_norm": 182.3594970703125, + "learning_rate": 1.3295825771324864e-05, + "loss": 32.7557, + "step": 3078 + }, + { + "epoch": 11.115575620767494, + "grad_norm": 215.4852752685547, + "learning_rate": 1.329038112522686e-05, + "loss": 32.185, + "step": 3079 + }, + { + "epoch": 11.119187358916479, + "grad_norm": 237.4733123779297, + "learning_rate": 1.3284936479128856e-05, + "loss": 32.8733, + "step": 3080 + }, + { + "epoch": 11.119187358916479, + "eval_loss": 0.6130570769309998, + "eval_runtime": 3.154, + "eval_samples_per_second": 56.754, + "eval_steps_per_second": 56.754, + "step": 3080 + }, + { + "epoch": 11.122799097065462, + "grad_norm": 202.9044952392578, + "learning_rate": 1.3279491833030853e-05, + "loss": 33.89, + "step": 3081 + }, + { + "epoch": 11.126410835214447, + "grad_norm": 230.82086181640625, + "learning_rate": 1.327404718693285e-05, + "loss": 34.0808, + "step": 3082 + }, + { + "epoch": 11.130022573363432, + "grad_norm": 318.1103515625, + "learning_rate": 1.3268602540834846e-05, + "loss": 35.5715, + "step": 3083 + }, + { + "epoch": 11.133634311512415, + "grad_norm": 296.760986328125, + "learning_rate": 1.3263157894736843e-05, + "loss": 36.0701, + "step": 3084 + }, + { + "epoch": 11.1372460496614, + "grad_norm": 355.1922302246094, + "learning_rate": 1.3257713248638838e-05, + "loss": 35.027, + "step": 3085 + }, + { + "epoch": 11.140857787810384, + "grad_norm": 379.0643310546875, + "learning_rate": 1.3252268602540835e-05, + "loss": 36.8225, + "step": 3086 + }, + { + "epoch": 11.144469525959368, + "grad_norm": 271.0293273925781, + "learning_rate": 1.3246823956442832e-05, + "loss": 34.18, + "step": 3087 + }, + { + "epoch": 11.148081264108352, + "grad_norm": 231.29782104492188, + "learning_rate": 1.3241379310344828e-05, + "loss": 37.5546, + "step": 3088 + }, + { + "epoch": 11.151693002257336, + "grad_norm": 236.58180236816406, + "learning_rate": 1.3235934664246823e-05, + "loss": 35.8625, + "step": 3089 + }, + { + "epoch": 11.15530474040632, + "grad_norm": 220.71853637695312, + "learning_rate": 1.3230490018148822e-05, + "loss": 38.1384, + "step": 3090 + }, + { + "epoch": 11.15530474040632, + "eval_loss": 0.6140565276145935, + "eval_runtime": 3.1543, + "eval_samples_per_second": 56.747, + "eval_steps_per_second": 56.747, + "step": 3090 + }, + { + "epoch": 11.158916478555305, + "grad_norm": 251.32090759277344, + "learning_rate": 1.3225045372050817e-05, + "loss": 36.7226, + "step": 3091 + }, + { + "epoch": 11.162528216704288, + "grad_norm": 244.061279296875, + "learning_rate": 1.3219600725952814e-05, + "loss": 37.2144, + "step": 3092 + }, + { + "epoch": 11.166139954853273, + "grad_norm": 274.3013610839844, + "learning_rate": 1.321415607985481e-05, + "loss": 27.0703, + "step": 3093 + }, + { + "epoch": 11.169751693002258, + "grad_norm": 197.1829071044922, + "learning_rate": 1.3208711433756805e-05, + "loss": 23.0504, + "step": 3094 + }, + { + "epoch": 11.173363431151241, + "grad_norm": 205.8387451171875, + "learning_rate": 1.3203266787658804e-05, + "loss": 23.4632, + "step": 3095 + }, + { + "epoch": 11.176975169300226, + "grad_norm": 237.6263427734375, + "learning_rate": 1.31978221415608e-05, + "loss": 23.9426, + "step": 3096 + }, + { + "epoch": 11.18058690744921, + "grad_norm": 177.99688720703125, + "learning_rate": 1.3192377495462795e-05, + "loss": 24.2553, + "step": 3097 + }, + { + "epoch": 11.184198645598194, + "grad_norm": 235.16787719726562, + "learning_rate": 1.3186932849364792e-05, + "loss": 41.3257, + "step": 3098 + }, + { + "epoch": 11.187810383747179, + "grad_norm": 213.4043731689453, + "learning_rate": 1.3181488203266787e-05, + "loss": 42.3344, + "step": 3099 + }, + { + "epoch": 11.191422121896162, + "grad_norm": 162.57554626464844, + "learning_rate": 1.3176043557168784e-05, + "loss": 41.2702, + "step": 3100 + }, + { + "epoch": 11.191422121896162, + "eval_loss": 0.6155741214752197, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 3100 + }, + { + "epoch": 11.195033860045147, + "grad_norm": 215.84335327148438, + "learning_rate": 1.3170598911070781e-05, + "loss": 41.0582, + "step": 3101 + }, + { + "epoch": 11.198645598194132, + "grad_norm": 295.0271301269531, + "learning_rate": 1.3165154264972777e-05, + "loss": 41.3479, + "step": 3102 + }, + { + "epoch": 11.202257336343115, + "grad_norm": 287.3316955566406, + "learning_rate": 1.3159709618874774e-05, + "loss": 41.6267, + "step": 3103 + }, + { + "epoch": 11.2058690744921, + "grad_norm": 249.3993377685547, + "learning_rate": 1.315426497277677e-05, + "loss": 40.5208, + "step": 3104 + }, + { + "epoch": 11.209480812641084, + "grad_norm": 274.5410461425781, + "learning_rate": 1.3148820326678766e-05, + "loss": 41.7072, + "step": 3105 + }, + { + "epoch": 11.213092550790067, + "grad_norm": 259.49627685546875, + "learning_rate": 1.3143375680580763e-05, + "loss": 41.0034, + "step": 3106 + }, + { + "epoch": 11.216704288939052, + "grad_norm": 246.60902404785156, + "learning_rate": 1.3137931034482759e-05, + "loss": 40.1154, + "step": 3107 + }, + { + "epoch": 11.220316027088035, + "grad_norm": 224.0052947998047, + "learning_rate": 1.3132486388384754e-05, + "loss": 41.1167, + "step": 3108 + }, + { + "epoch": 11.22392776523702, + "grad_norm": 204.24021911621094, + "learning_rate": 1.3127041742286753e-05, + "loss": 37.0909, + "step": 3109 + }, + { + "epoch": 11.227539503386005, + "grad_norm": 206.67681884765625, + "learning_rate": 1.3121597096188748e-05, + "loss": 38.0959, + "step": 3110 + }, + { + "epoch": 11.227539503386005, + "eval_loss": 0.6148640513420105, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 3110 + }, + { + "epoch": 11.231151241534988, + "grad_norm": 255.91238403320312, + "learning_rate": 1.3116152450090743e-05, + "loss": 38.8076, + "step": 3111 + }, + { + "epoch": 11.234762979683973, + "grad_norm": 239.5032958984375, + "learning_rate": 1.311070780399274e-05, + "loss": 39.3991, + "step": 3112 + }, + { + "epoch": 11.238374717832958, + "grad_norm": 254.8914031982422, + "learning_rate": 1.3105263157894738e-05, + "loss": 37.7301, + "step": 3113 + }, + { + "epoch": 11.241986455981941, + "grad_norm": 229.97943115234375, + "learning_rate": 1.3099818511796735e-05, + "loss": 38.8527, + "step": 3114 + }, + { + "epoch": 11.245598194130926, + "grad_norm": 208.1148681640625, + "learning_rate": 1.309437386569873e-05, + "loss": 38.8518, + "step": 3115 + }, + { + "epoch": 11.249209932279909, + "grad_norm": 208.49557495117188, + "learning_rate": 1.3088929219600725e-05, + "loss": 38.927, + "step": 3116 + }, + { + "epoch": 11.252821670428894, + "grad_norm": 332.9958801269531, + "learning_rate": 1.3083484573502723e-05, + "loss": 40.0492, + "step": 3117 + }, + { + "epoch": 11.256433408577879, + "grad_norm": 253.16769409179688, + "learning_rate": 1.307803992740472e-05, + "loss": 39.1965, + "step": 3118 + }, + { + "epoch": 11.260045146726862, + "grad_norm": 243.8136444091797, + "learning_rate": 1.3072595281306715e-05, + "loss": 38.2286, + "step": 3119 + }, + { + "epoch": 11.263656884875846, + "grad_norm": 273.6463623046875, + "learning_rate": 1.3067150635208712e-05, + "loss": 39.3751, + "step": 3120 + }, + { + "epoch": 11.263656884875846, + "eval_loss": 0.6175129413604736, + "eval_runtime": 3.145, + "eval_samples_per_second": 56.916, + "eval_steps_per_second": 56.916, + "step": 3120 + }, + { + "epoch": 11.267268623024831, + "grad_norm": 228.980224609375, + "learning_rate": 1.3061705989110707e-05, + "loss": 40.29, + "step": 3121 + }, + { + "epoch": 11.270880361173814, + "grad_norm": 292.6310729980469, + "learning_rate": 1.3056261343012703e-05, + "loss": 41.1785, + "step": 3122 + }, + { + "epoch": 11.2744920993228, + "grad_norm": 217.0737762451172, + "learning_rate": 1.3050816696914702e-05, + "loss": 40.9514, + "step": 3123 + }, + { + "epoch": 11.278103837471784, + "grad_norm": 227.0102081298828, + "learning_rate": 1.3045372050816697e-05, + "loss": 39.6132, + "step": 3124 + }, + { + "epoch": 11.281715575620767, + "grad_norm": 195.74667358398438, + "learning_rate": 1.3039927404718694e-05, + "loss": 39.5024, + "step": 3125 + }, + { + "epoch": 11.285327313769752, + "grad_norm": 222.6744384765625, + "learning_rate": 1.303448275862069e-05, + "loss": 37.7863, + "step": 3126 + }, + { + "epoch": 11.288939051918735, + "grad_norm": 207.1038055419922, + "learning_rate": 1.3029038112522687e-05, + "loss": 34.9129, + "step": 3127 + }, + { + "epoch": 11.29255079006772, + "grad_norm": 227.38330078125, + "learning_rate": 1.3023593466424684e-05, + "loss": 33.231, + "step": 3128 + }, + { + "epoch": 11.296162528216705, + "grad_norm": 254.19442749023438, + "learning_rate": 1.3018148820326679e-05, + "loss": 33.3166, + "step": 3129 + }, + { + "epoch": 11.299774266365688, + "grad_norm": 221.4664306640625, + "learning_rate": 1.3012704174228674e-05, + "loss": 33.2336, + "step": 3130 + }, + { + "epoch": 11.299774266365688, + "eval_loss": 0.6138683557510376, + "eval_runtime": 3.1463, + "eval_samples_per_second": 56.892, + "eval_steps_per_second": 56.892, + "step": 3130 + }, + { + "epoch": 11.303386004514673, + "grad_norm": 179.73678588867188, + "learning_rate": 1.3007259528130671e-05, + "loss": 34.0082, + "step": 3131 + }, + { + "epoch": 11.306997742663658, + "grad_norm": 238.66107177734375, + "learning_rate": 1.3001814882032669e-05, + "loss": 33.1898, + "step": 3132 + }, + { + "epoch": 11.31060948081264, + "grad_norm": 315.51934814453125, + "learning_rate": 1.2996370235934666e-05, + "loss": 34.5558, + "step": 3133 + }, + { + "epoch": 11.314221218961626, + "grad_norm": 235.54217529296875, + "learning_rate": 1.2990925589836661e-05, + "loss": 32.4498, + "step": 3134 + }, + { + "epoch": 11.317832957110609, + "grad_norm": 225.9518280029297, + "learning_rate": 1.2985480943738656e-05, + "loss": 34.1823, + "step": 3135 + }, + { + "epoch": 11.321444695259594, + "grad_norm": 276.5481262207031, + "learning_rate": 1.2980036297640655e-05, + "loss": 34.6704, + "step": 3136 + }, + { + "epoch": 11.325056433408578, + "grad_norm": 306.4985656738281, + "learning_rate": 1.297459165154265e-05, + "loss": 35.9149, + "step": 3137 + }, + { + "epoch": 11.328668171557561, + "grad_norm": 207.28550720214844, + "learning_rate": 1.2969147005444646e-05, + "loss": 34.876, + "step": 3138 + }, + { + "epoch": 11.332279909706546, + "grad_norm": 238.89157104492188, + "learning_rate": 1.2963702359346643e-05, + "loss": 36.7191, + "step": 3139 + }, + { + "epoch": 11.335891647855531, + "grad_norm": 281.7445068359375, + "learning_rate": 1.2958257713248638e-05, + "loss": 37.9134, + "step": 3140 + }, + { + "epoch": 11.335891647855531, + "eval_loss": 0.6141538023948669, + "eval_runtime": 3.1622, + "eval_samples_per_second": 56.606, + "eval_steps_per_second": 56.606, + "step": 3140 + }, + { + "epoch": 11.339503386004514, + "grad_norm": 261.58221435546875, + "learning_rate": 1.2952813067150635e-05, + "loss": 36.7193, + "step": 3141 + }, + { + "epoch": 11.343115124153499, + "grad_norm": 260.8083190917969, + "learning_rate": 1.2947368421052633e-05, + "loss": 36.9418, + "step": 3142 + }, + { + "epoch": 11.346726862302482, + "grad_norm": 263.466552734375, + "learning_rate": 1.2941923774954628e-05, + "loss": 31.1083, + "step": 3143 + }, + { + "epoch": 11.350338600451467, + "grad_norm": 201.6587677001953, + "learning_rate": 1.2936479128856625e-05, + "loss": 23.4982, + "step": 3144 + }, + { + "epoch": 11.353950338600452, + "grad_norm": 230.29629516601562, + "learning_rate": 1.293103448275862e-05, + "loss": 22.5417, + "step": 3145 + }, + { + "epoch": 11.357562076749435, + "grad_norm": 193.08795166015625, + "learning_rate": 1.2925589836660617e-05, + "loss": 23.6032, + "step": 3146 + }, + { + "epoch": 11.36117381489842, + "grad_norm": 206.49093627929688, + "learning_rate": 1.2920145190562615e-05, + "loss": 24.1813, + "step": 3147 + }, + { + "epoch": 11.364785553047405, + "grad_norm": 285.38348388671875, + "learning_rate": 1.291470054446461e-05, + "loss": 41.4394, + "step": 3148 + }, + { + "epoch": 11.368397291196388, + "grad_norm": 307.4984130859375, + "learning_rate": 1.2909255898366605e-05, + "loss": 43.8865, + "step": 3149 + }, + { + "epoch": 11.372009029345373, + "grad_norm": 256.685791015625, + "learning_rate": 1.2903811252268604e-05, + "loss": 41.5534, + "step": 3150 + }, + { + "epoch": 11.372009029345373, + "eval_loss": 0.6155339479446411, + "eval_runtime": 3.1488, + "eval_samples_per_second": 56.846, + "eval_steps_per_second": 56.846, + "step": 3150 + }, + { + "epoch": 11.375620767494357, + "grad_norm": 302.5317077636719, + "learning_rate": 1.28983666061706e-05, + "loss": 41.5231, + "step": 3151 + }, + { + "epoch": 11.37923250564334, + "grad_norm": 381.4787292480469, + "learning_rate": 1.2892921960072595e-05, + "loss": 40.7064, + "step": 3152 + }, + { + "epoch": 11.382844243792325, + "grad_norm": 313.63116455078125, + "learning_rate": 1.2887477313974592e-05, + "loss": 41.4045, + "step": 3153 + }, + { + "epoch": 11.386455981941308, + "grad_norm": 265.4134521484375, + "learning_rate": 1.2882032667876587e-05, + "loss": 41.2618, + "step": 3154 + }, + { + "epoch": 11.390067720090293, + "grad_norm": 260.43084716796875, + "learning_rate": 1.2876588021778586e-05, + "loss": 42.6311, + "step": 3155 + }, + { + "epoch": 11.393679458239278, + "grad_norm": 326.7022705078125, + "learning_rate": 1.2871143375680581e-05, + "loss": 41.8859, + "step": 3156 + }, + { + "epoch": 11.397291196388261, + "grad_norm": 420.966552734375, + "learning_rate": 1.2865698729582577e-05, + "loss": 41.8117, + "step": 3157 + }, + { + "epoch": 11.400902934537246, + "grad_norm": 280.8377380371094, + "learning_rate": 1.2860254083484574e-05, + "loss": 41.3303, + "step": 3158 + }, + { + "epoch": 11.404514672686231, + "grad_norm": 238.64564514160156, + "learning_rate": 1.2854809437386571e-05, + "loss": 38.253, + "step": 3159 + }, + { + "epoch": 11.408126410835214, + "grad_norm": 258.8091125488281, + "learning_rate": 1.2849364791288566e-05, + "loss": 39.2494, + "step": 3160 + }, + { + "epoch": 11.408126410835214, + "eval_loss": 0.6130858659744263, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.9, + "eval_steps_per_second": 56.9, + "step": 3160 + }, + { + "epoch": 11.411738148984199, + "grad_norm": 209.76300048828125, + "learning_rate": 1.2843920145190563e-05, + "loss": 39.1069, + "step": 3161 + }, + { + "epoch": 11.415349887133182, + "grad_norm": 215.24072265625, + "learning_rate": 1.2838475499092559e-05, + "loss": 38.8867, + "step": 3162 + }, + { + "epoch": 11.418961625282167, + "grad_norm": 285.4281311035156, + "learning_rate": 1.2833030852994554e-05, + "loss": 38.0298, + "step": 3163 + }, + { + "epoch": 11.422573363431152, + "grad_norm": 322.1593017578125, + "learning_rate": 1.2827586206896553e-05, + "loss": 40.2122, + "step": 3164 + }, + { + "epoch": 11.426185101580135, + "grad_norm": 277.2178955078125, + "learning_rate": 1.2822141560798548e-05, + "loss": 38.0829, + "step": 3165 + }, + { + "epoch": 11.42979683972912, + "grad_norm": 186.9705810546875, + "learning_rate": 1.2816696914700545e-05, + "loss": 40.6601, + "step": 3166 + }, + { + "epoch": 11.433408577878104, + "grad_norm": 210.6102294921875, + "learning_rate": 1.281125226860254e-05, + "loss": 39.0126, + "step": 3167 + }, + { + "epoch": 11.437020316027088, + "grad_norm": 234.50717163085938, + "learning_rate": 1.2805807622504536e-05, + "loss": 38.6465, + "step": 3168 + }, + { + "epoch": 11.440632054176072, + "grad_norm": 217.9093475341797, + "learning_rate": 1.2800362976406535e-05, + "loss": 39.2568, + "step": 3169 + }, + { + "epoch": 11.444243792325057, + "grad_norm": 252.82054138183594, + "learning_rate": 1.279491833030853e-05, + "loss": 39.005, + "step": 3170 + }, + { + "epoch": 11.444243792325057, + "eval_loss": 0.6125118732452393, + "eval_runtime": 3.1425, + "eval_samples_per_second": 56.961, + "eval_steps_per_second": 56.961, + "step": 3170 + }, + { + "epoch": 11.44785553047404, + "grad_norm": 290.2322998046875, + "learning_rate": 1.2789473684210526e-05, + "loss": 39.6133, + "step": 3171 + }, + { + "epoch": 11.451467268623025, + "grad_norm": 250.72450256347656, + "learning_rate": 1.2784029038112523e-05, + "loss": 40.3251, + "step": 3172 + }, + { + "epoch": 11.455079006772008, + "grad_norm": 273.91229248046875, + "learning_rate": 1.277858439201452e-05, + "loss": 39.5129, + "step": 3173 + }, + { + "epoch": 11.458690744920993, + "grad_norm": 214.30038452148438, + "learning_rate": 1.2773139745916515e-05, + "loss": 40.5093, + "step": 3174 + }, + { + "epoch": 11.462302483069978, + "grad_norm": 264.251708984375, + "learning_rate": 1.2767695099818512e-05, + "loss": 38.3837, + "step": 3175 + }, + { + "epoch": 11.465914221218961, + "grad_norm": 224.7700653076172, + "learning_rate": 1.2762250453720508e-05, + "loss": 37.8522, + "step": 3176 + }, + { + "epoch": 11.469525959367946, + "grad_norm": 238.35604858398438, + "learning_rate": 1.2756805807622505e-05, + "loss": 34.0249, + "step": 3177 + }, + { + "epoch": 11.47313769751693, + "grad_norm": 181.4731903076172, + "learning_rate": 1.2751361161524502e-05, + "loss": 34.2473, + "step": 3178 + }, + { + "epoch": 11.476749435665914, + "grad_norm": 240.2397003173828, + "learning_rate": 1.2745916515426497e-05, + "loss": 32.8657, + "step": 3179 + }, + { + "epoch": 11.480361173814899, + "grad_norm": 283.2740478515625, + "learning_rate": 1.2740471869328494e-05, + "loss": 34.6619, + "step": 3180 + }, + { + "epoch": 11.480361173814899, + "eval_loss": 0.6126638054847717, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3180 + }, + { + "epoch": 11.483972911963882, + "grad_norm": 248.70912170410156, + "learning_rate": 1.273502722323049e-05, + "loss": 33.0975, + "step": 3181 + }, + { + "epoch": 11.487584650112867, + "grad_norm": 210.9479217529297, + "learning_rate": 1.2729582577132487e-05, + "loss": 34.2069, + "step": 3182 + }, + { + "epoch": 11.491196388261852, + "grad_norm": 234.31399536132812, + "learning_rate": 1.2724137931034484e-05, + "loss": 35.811, + "step": 3183 + }, + { + "epoch": 11.494808126410835, + "grad_norm": 253.24478149414062, + "learning_rate": 1.271869328493648e-05, + "loss": 35.6234, + "step": 3184 + }, + { + "epoch": 11.49841986455982, + "grad_norm": 259.0565185546875, + "learning_rate": 1.2713248638838476e-05, + "loss": 35.1495, + "step": 3185 + }, + { + "epoch": 11.502031602708804, + "grad_norm": 235.4202880859375, + "learning_rate": 1.2707803992740472e-05, + "loss": 35.1363, + "step": 3186 + }, + { + "epoch": 11.505643340857787, + "grad_norm": 248.30267333984375, + "learning_rate": 1.2702359346642469e-05, + "loss": 35.9653, + "step": 3187 + }, + { + "epoch": 11.509255079006772, + "grad_norm": 197.6142120361328, + "learning_rate": 1.2696914700544466e-05, + "loss": 35.6304, + "step": 3188 + }, + { + "epoch": 11.512866817155757, + "grad_norm": 329.27862548828125, + "learning_rate": 1.2691470054446461e-05, + "loss": 35.6111, + "step": 3189 + }, + { + "epoch": 11.51647855530474, + "grad_norm": 194.7126922607422, + "learning_rate": 1.2686025408348457e-05, + "loss": 35.0693, + "step": 3190 + }, + { + "epoch": 11.51647855530474, + "eval_loss": 0.6106634736061096, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 3190 + }, + { + "epoch": 11.520090293453725, + "grad_norm": 243.0207061767578, + "learning_rate": 1.2680580762250454e-05, + "loss": 37.6373, + "step": 3191 + }, + { + "epoch": 11.523702031602708, + "grad_norm": 282.0947265625, + "learning_rate": 1.267513611615245e-05, + "loss": 36.2595, + "step": 3192 + }, + { + "epoch": 11.527313769751693, + "grad_norm": 249.8011932373047, + "learning_rate": 1.2669691470054446e-05, + "loss": 35.5601, + "step": 3193 + }, + { + "epoch": 11.530925507900678, + "grad_norm": 202.17503356933594, + "learning_rate": 1.2664246823956443e-05, + "loss": 23.1075, + "step": 3194 + }, + { + "epoch": 11.534537246049661, + "grad_norm": 188.78128051757812, + "learning_rate": 1.2658802177858439e-05, + "loss": 22.2458, + "step": 3195 + }, + { + "epoch": 11.538148984198646, + "grad_norm": 219.24722290039062, + "learning_rate": 1.2653357531760437e-05, + "loss": 23.7842, + "step": 3196 + }, + { + "epoch": 11.54176072234763, + "grad_norm": 213.0615234375, + "learning_rate": 1.2647912885662433e-05, + "loss": 25.3773, + "step": 3197 + }, + { + "epoch": 11.545372460496614, + "grad_norm": 274.6806335449219, + "learning_rate": 1.2642468239564428e-05, + "loss": 40.396, + "step": 3198 + }, + { + "epoch": 11.548984198645599, + "grad_norm": 248.91778564453125, + "learning_rate": 1.2637023593466425e-05, + "loss": 42.2405, + "step": 3199 + }, + { + "epoch": 11.552595936794582, + "grad_norm": 228.45591735839844, + "learning_rate": 1.263157894736842e-05, + "loss": 40.7328, + "step": 3200 + }, + { + "epoch": 11.552595936794582, + "eval_loss": 0.6154705286026001, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 3200 + }, + { + "epoch": 11.556207674943566, + "grad_norm": 206.54483032226562, + "learning_rate": 1.2626134301270418e-05, + "loss": 40.6909, + "step": 3201 + }, + { + "epoch": 11.559819413092551, + "grad_norm": 199.14816284179688, + "learning_rate": 1.2620689655172415e-05, + "loss": 40.6918, + "step": 3202 + }, + { + "epoch": 11.563431151241534, + "grad_norm": 217.4789276123047, + "learning_rate": 1.261524500907441e-05, + "loss": 41.686, + "step": 3203 + }, + { + "epoch": 11.56704288939052, + "grad_norm": 209.83084106445312, + "learning_rate": 1.2609800362976406e-05, + "loss": 40.685, + "step": 3204 + }, + { + "epoch": 11.570654627539504, + "grad_norm": 184.56614685058594, + "learning_rate": 1.2604355716878404e-05, + "loss": 42.1684, + "step": 3205 + }, + { + "epoch": 11.574266365688487, + "grad_norm": 226.84622192382812, + "learning_rate": 1.25989110707804e-05, + "loss": 42.4169, + "step": 3206 + }, + { + "epoch": 11.577878103837472, + "grad_norm": 271.7705383300781, + "learning_rate": 1.2593466424682397e-05, + "loss": 41.9603, + "step": 3207 + }, + { + "epoch": 11.581489841986457, + "grad_norm": 206.48257446289062, + "learning_rate": 1.2588021778584392e-05, + "loss": 39.9903, + "step": 3208 + }, + { + "epoch": 11.58510158013544, + "grad_norm": 190.86009216308594, + "learning_rate": 1.2582577132486388e-05, + "loss": 39.3138, + "step": 3209 + }, + { + "epoch": 11.588713318284425, + "grad_norm": 217.0152130126953, + "learning_rate": 1.2577132486388386e-05, + "loss": 37.652, + "step": 3210 + }, + { + "epoch": 11.588713318284425, + "eval_loss": 0.6143624186515808, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 3210 + }, + { + "epoch": 11.592325056433408, + "grad_norm": 203.3090362548828, + "learning_rate": 1.2571687840290382e-05, + "loss": 38.5532, + "step": 3211 + }, + { + "epoch": 11.595936794582393, + "grad_norm": 237.18287658691406, + "learning_rate": 1.2566243194192377e-05, + "loss": 38.4073, + "step": 3212 + }, + { + "epoch": 11.599548532731378, + "grad_norm": 222.20489501953125, + "learning_rate": 1.2560798548094374e-05, + "loss": 37.7122, + "step": 3213 + }, + { + "epoch": 11.60316027088036, + "grad_norm": 261.4862060546875, + "learning_rate": 1.255535390199637e-05, + "loss": 39.0125, + "step": 3214 + }, + { + "epoch": 11.606772009029346, + "grad_norm": 235.49668884277344, + "learning_rate": 1.2549909255898367e-05, + "loss": 38.1753, + "step": 3215 + }, + { + "epoch": 11.610383747178329, + "grad_norm": 219.66139221191406, + "learning_rate": 1.2544464609800364e-05, + "loss": 40.3478, + "step": 3216 + }, + { + "epoch": 11.613995485327314, + "grad_norm": 282.8075256347656, + "learning_rate": 1.2539019963702359e-05, + "loss": 39.3672, + "step": 3217 + }, + { + "epoch": 11.617607223476298, + "grad_norm": 235.07875061035156, + "learning_rate": 1.2533575317604356e-05, + "loss": 39.8955, + "step": 3218 + }, + { + "epoch": 11.621218961625281, + "grad_norm": 328.829833984375, + "learning_rate": 1.2528130671506353e-05, + "loss": 38.626, + "step": 3219 + }, + { + "epoch": 11.624830699774266, + "grad_norm": 283.1789245605469, + "learning_rate": 1.2522686025408349e-05, + "loss": 40.0565, + "step": 3220 + }, + { + "epoch": 11.624830699774266, + "eval_loss": 0.6113889217376709, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3220 + }, + { + "epoch": 11.628442437923251, + "grad_norm": 230.88047790527344, + "learning_rate": 1.2517241379310346e-05, + "loss": 40.1155, + "step": 3221 + }, + { + "epoch": 11.632054176072234, + "grad_norm": 258.1295166015625, + "learning_rate": 1.2511796733212341e-05, + "loss": 40.4707, + "step": 3222 + }, + { + "epoch": 11.635665914221219, + "grad_norm": 255.82699584960938, + "learning_rate": 1.2506352087114336e-05, + "loss": 41.1296, + "step": 3223 + }, + { + "epoch": 11.639277652370204, + "grad_norm": 226.4784393310547, + "learning_rate": 1.2500907441016335e-05, + "loss": 39.1159, + "step": 3224 + }, + { + "epoch": 11.642889390519187, + "grad_norm": 257.38104248046875, + "learning_rate": 1.249546279491833e-05, + "loss": 40.7933, + "step": 3225 + }, + { + "epoch": 11.646501128668172, + "grad_norm": 218.69070434570312, + "learning_rate": 1.2490018148820328e-05, + "loss": 39.6723, + "step": 3226 + }, + { + "epoch": 11.650112866817155, + "grad_norm": 232.3351287841797, + "learning_rate": 1.2484573502722323e-05, + "loss": 37.5671, + "step": 3227 + }, + { + "epoch": 11.65372460496614, + "grad_norm": 229.93295288085938, + "learning_rate": 1.2479128856624318e-05, + "loss": 32.7819, + "step": 3228 + }, + { + "epoch": 11.657336343115125, + "grad_norm": 265.6002197265625, + "learning_rate": 1.2473684210526317e-05, + "loss": 32.5955, + "step": 3229 + }, + { + "epoch": 11.660948081264108, + "grad_norm": 278.47705078125, + "learning_rate": 1.2468239564428313e-05, + "loss": 32.9901, + "step": 3230 + }, + { + "epoch": 11.660948081264108, + "eval_loss": 0.6078047752380371, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3230 + }, + { + "epoch": 11.664559819413093, + "grad_norm": 239.9285430908203, + "learning_rate": 1.2462794918330308e-05, + "loss": 33.2737, + "step": 3231 + }, + { + "epoch": 11.668171557562077, + "grad_norm": 358.36090087890625, + "learning_rate": 1.2457350272232305e-05, + "loss": 34.8522, + "step": 3232 + }, + { + "epoch": 11.67178329571106, + "grad_norm": 258.0733642578125, + "learning_rate": 1.2451905626134302e-05, + "loss": 34.6796, + "step": 3233 + }, + { + "epoch": 11.675395033860045, + "grad_norm": 296.21942138671875, + "learning_rate": 1.2446460980036298e-05, + "loss": 35.8479, + "step": 3234 + }, + { + "epoch": 11.679006772009028, + "grad_norm": 229.6141815185547, + "learning_rate": 1.2441016333938295e-05, + "loss": 36.4934, + "step": 3235 + }, + { + "epoch": 11.682618510158013, + "grad_norm": 238.6092987060547, + "learning_rate": 1.243557168784029e-05, + "loss": 35.2253, + "step": 3236 + }, + { + "epoch": 11.686230248306998, + "grad_norm": 300.76300048828125, + "learning_rate": 1.2430127041742287e-05, + "loss": 34.9373, + "step": 3237 + }, + { + "epoch": 11.689841986455981, + "grad_norm": 227.70672607421875, + "learning_rate": 1.2424682395644284e-05, + "loss": 35.4369, + "step": 3238 + }, + { + "epoch": 11.693453724604966, + "grad_norm": 218.36000061035156, + "learning_rate": 1.241923774954628e-05, + "loss": 35.3398, + "step": 3239 + }, + { + "epoch": 11.697065462753951, + "grad_norm": 220.78475952148438, + "learning_rate": 1.2413793103448277e-05, + "loss": 35.7612, + "step": 3240 + }, + { + "epoch": 11.697065462753951, + "eval_loss": 0.6067846417427063, + "eval_runtime": 3.1322, + "eval_samples_per_second": 57.148, + "eval_steps_per_second": 57.148, + "step": 3240 + }, + { + "epoch": 11.700677200902934, + "grad_norm": 237.34437561035156, + "learning_rate": 1.2408348457350272e-05, + "loss": 38.0459, + "step": 3241 + }, + { + "epoch": 11.704288939051919, + "grad_norm": 251.60633850097656, + "learning_rate": 1.2402903811252269e-05, + "loss": 35.4676, + "step": 3242 + }, + { + "epoch": 11.707900677200904, + "grad_norm": 214.17117309570312, + "learning_rate": 1.2397459165154266e-05, + "loss": 30.5595, + "step": 3243 + }, + { + "epoch": 11.711512415349887, + "grad_norm": 202.3698272705078, + "learning_rate": 1.2392014519056262e-05, + "loss": 23.7468, + "step": 3244 + }, + { + "epoch": 11.715124153498872, + "grad_norm": 229.11776733398438, + "learning_rate": 1.2386569872958257e-05, + "loss": 23.1255, + "step": 3245 + }, + { + "epoch": 11.718735891647855, + "grad_norm": 175.93829345703125, + "learning_rate": 1.2381125226860254e-05, + "loss": 23.7349, + "step": 3246 + }, + { + "epoch": 11.72234762979684, + "grad_norm": 232.7489471435547, + "learning_rate": 1.2375680580762251e-05, + "loss": 24.4997, + "step": 3247 + }, + { + "epoch": 11.725959367945824, + "grad_norm": 280.5601806640625, + "learning_rate": 1.2370235934664248e-05, + "loss": 42.3811, + "step": 3248 + }, + { + "epoch": 11.729571106094808, + "grad_norm": 292.2538146972656, + "learning_rate": 1.2364791288566244e-05, + "loss": 42.9804, + "step": 3249 + }, + { + "epoch": 11.733182844243792, + "grad_norm": 265.0259704589844, + "learning_rate": 1.2359346642468239e-05, + "loss": 41.1251, + "step": 3250 + }, + { + "epoch": 11.733182844243792, + "eval_loss": 0.6141200065612793, + "eval_runtime": 3.1404, + "eval_samples_per_second": 56.999, + "eval_steps_per_second": 56.999, + "step": 3250 + }, + { + "epoch": 11.736794582392777, + "grad_norm": 232.92893981933594, + "learning_rate": 1.2353901996370236e-05, + "loss": 40.9372, + "step": 3251 + }, + { + "epoch": 11.74040632054176, + "grad_norm": 176.99818420410156, + "learning_rate": 1.2348457350272233e-05, + "loss": 41.0757, + "step": 3252 + }, + { + "epoch": 11.744018058690745, + "grad_norm": 206.5728759765625, + "learning_rate": 1.2343012704174228e-05, + "loss": 41.9635, + "step": 3253 + }, + { + "epoch": 11.747629796839728, + "grad_norm": 211.2556915283203, + "learning_rate": 1.2337568058076226e-05, + "loss": 41.5217, + "step": 3254 + }, + { + "epoch": 11.751241534988713, + "grad_norm": 198.8915252685547, + "learning_rate": 1.2332123411978221e-05, + "loss": 42.9997, + "step": 3255 + }, + { + "epoch": 11.754853273137698, + "grad_norm": 291.2761535644531, + "learning_rate": 1.2326678765880218e-05, + "loss": 42.2561, + "step": 3256 + }, + { + "epoch": 11.758465011286681, + "grad_norm": 243.2998046875, + "learning_rate": 1.2321234119782215e-05, + "loss": 41.6219, + "step": 3257 + }, + { + "epoch": 11.762076749435666, + "grad_norm": 266.1149597167969, + "learning_rate": 1.231578947368421e-05, + "loss": 40.1646, + "step": 3258 + }, + { + "epoch": 11.76568848758465, + "grad_norm": 236.6083221435547, + "learning_rate": 1.2310344827586208e-05, + "loss": 39.7079, + "step": 3259 + }, + { + "epoch": 11.769300225733634, + "grad_norm": 196.397216796875, + "learning_rate": 1.2304900181488203e-05, + "loss": 39.6629, + "step": 3260 + }, + { + "epoch": 11.769300225733634, + "eval_loss": 0.6124016046524048, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.055, + "eval_steps_per_second": 57.055, + "step": 3260 + }, + { + "epoch": 11.772911963882619, + "grad_norm": 198.52500915527344, + "learning_rate": 1.22994555353902e-05, + "loss": 38.5285, + "step": 3261 + }, + { + "epoch": 11.776523702031604, + "grad_norm": 236.25477600097656, + "learning_rate": 1.2294010889292197e-05, + "loss": 38.3358, + "step": 3262 + }, + { + "epoch": 11.780135440180587, + "grad_norm": 260.35955810546875, + "learning_rate": 1.2288566243194192e-05, + "loss": 38.374, + "step": 3263 + }, + { + "epoch": 11.783747178329572, + "grad_norm": 313.078857421875, + "learning_rate": 1.2283121597096188e-05, + "loss": 39.124, + "step": 3264 + }, + { + "epoch": 11.787358916478555, + "grad_norm": 191.34027099609375, + "learning_rate": 1.2277676950998187e-05, + "loss": 39.1776, + "step": 3265 + }, + { + "epoch": 11.79097065462754, + "grad_norm": 203.5764923095703, + "learning_rate": 1.2272232304900182e-05, + "loss": 38.7885, + "step": 3266 + }, + { + "epoch": 11.794582392776524, + "grad_norm": 234.38479614257812, + "learning_rate": 1.2266787658802177e-05, + "loss": 39.1353, + "step": 3267 + }, + { + "epoch": 11.798194130925507, + "grad_norm": 254.5694122314453, + "learning_rate": 1.2261343012704174e-05, + "loss": 38.141, + "step": 3268 + }, + { + "epoch": 11.801805869074492, + "grad_norm": 189.8268585205078, + "learning_rate": 1.225589836660617e-05, + "loss": 39.5199, + "step": 3269 + }, + { + "epoch": 11.805417607223477, + "grad_norm": 256.52728271484375, + "learning_rate": 1.2250453720508169e-05, + "loss": 41.5113, + "step": 3270 + }, + { + "epoch": 11.805417607223477, + "eval_loss": 0.6084021329879761, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.073, + "eval_steps_per_second": 57.073, + "step": 3270 + }, + { + "epoch": 11.80902934537246, + "grad_norm": 195.57321166992188, + "learning_rate": 1.2245009074410164e-05, + "loss": 39.8129, + "step": 3271 + }, + { + "epoch": 11.812641083521445, + "grad_norm": 228.6748809814453, + "learning_rate": 1.223956442831216e-05, + "loss": 40.2273, + "step": 3272 + }, + { + "epoch": 11.816252821670428, + "grad_norm": 209.96096801757812, + "learning_rate": 1.2234119782214156e-05, + "loss": 40.2254, + "step": 3273 + }, + { + "epoch": 11.819864559819413, + "grad_norm": 247.4613037109375, + "learning_rate": 1.2228675136116152e-05, + "loss": 40.71, + "step": 3274 + }, + { + "epoch": 11.823476297968398, + "grad_norm": 263.0521240234375, + "learning_rate": 1.2223230490018149e-05, + "loss": 39.5572, + "step": 3275 + }, + { + "epoch": 11.827088036117381, + "grad_norm": 225.53634643554688, + "learning_rate": 1.2217785843920146e-05, + "loss": 36.4388, + "step": 3276 + }, + { + "epoch": 11.830699774266366, + "grad_norm": 194.59527587890625, + "learning_rate": 1.2212341197822141e-05, + "loss": 33.1005, + "step": 3277 + }, + { + "epoch": 11.83431151241535, + "grad_norm": 314.715576171875, + "learning_rate": 1.2206896551724138e-05, + "loss": 32.9812, + "step": 3278 + }, + { + "epoch": 11.837923250564334, + "grad_norm": 205.86862182617188, + "learning_rate": 1.2201451905626136e-05, + "loss": 33.6331, + "step": 3279 + }, + { + "epoch": 11.841534988713319, + "grad_norm": 217.54722595214844, + "learning_rate": 1.2196007259528131e-05, + "loss": 33.6535, + "step": 3280 + }, + { + "epoch": 11.841534988713319, + "eval_loss": 0.609620213508606, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 3280 + }, + { + "epoch": 11.845146726862303, + "grad_norm": 231.25390625, + "learning_rate": 1.2190562613430128e-05, + "loss": 34.5218, + "step": 3281 + }, + { + "epoch": 11.848758465011286, + "grad_norm": 208.8440704345703, + "learning_rate": 1.2185117967332123e-05, + "loss": 34.354, + "step": 3282 + }, + { + "epoch": 11.852370203160271, + "grad_norm": 221.25547790527344, + "learning_rate": 1.2179673321234119e-05, + "loss": 34.5705, + "step": 3283 + }, + { + "epoch": 11.855981941309254, + "grad_norm": 331.4505920410156, + "learning_rate": 1.2174228675136118e-05, + "loss": 35.796, + "step": 3284 + }, + { + "epoch": 11.85959367945824, + "grad_norm": 337.1404113769531, + "learning_rate": 1.2168784029038113e-05, + "loss": 36.4544, + "step": 3285 + }, + { + "epoch": 11.863205417607224, + "grad_norm": 238.75303649902344, + "learning_rate": 1.2163339382940108e-05, + "loss": 35.7165, + "step": 3286 + }, + { + "epoch": 11.866817155756207, + "grad_norm": 260.088134765625, + "learning_rate": 1.2157894736842105e-05, + "loss": 35.5461, + "step": 3287 + }, + { + "epoch": 11.870428893905192, + "grad_norm": 265.0240173339844, + "learning_rate": 1.2152450090744102e-05, + "loss": 37.0143, + "step": 3288 + }, + { + "epoch": 11.874040632054175, + "grad_norm": 251.74273681640625, + "learning_rate": 1.21470054446461e-05, + "loss": 36.6145, + "step": 3289 + }, + { + "epoch": 11.87765237020316, + "grad_norm": 216.8999786376953, + "learning_rate": 1.2141560798548095e-05, + "loss": 36.3135, + "step": 3290 + }, + { + "epoch": 11.87765237020316, + "eval_loss": 0.6087896823883057, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.066, + "eval_steps_per_second": 57.066, + "step": 3290 + }, + { + "epoch": 11.881264108352145, + "grad_norm": 256.50006103515625, + "learning_rate": 1.213611615245009e-05, + "loss": 36.6596, + "step": 3291 + }, + { + "epoch": 11.884875846501128, + "grad_norm": 249.34164428710938, + "learning_rate": 1.2130671506352087e-05, + "loss": 37.6473, + "step": 3292 + }, + { + "epoch": 11.888487584650113, + "grad_norm": 211.9344940185547, + "learning_rate": 1.2125226860254084e-05, + "loss": 28.2839, + "step": 3293 + }, + { + "epoch": 11.892099322799098, + "grad_norm": 170.77166748046875, + "learning_rate": 1.211978221415608e-05, + "loss": 23.2231, + "step": 3294 + }, + { + "epoch": 11.89571106094808, + "grad_norm": 177.49789428710938, + "learning_rate": 1.2114337568058077e-05, + "loss": 22.7909, + "step": 3295 + }, + { + "epoch": 11.899322799097066, + "grad_norm": 189.0458221435547, + "learning_rate": 1.2108892921960072e-05, + "loss": 23.8062, + "step": 3296 + }, + { + "epoch": 11.90293453724605, + "grad_norm": 182.90457153320312, + "learning_rate": 1.2103448275862068e-05, + "loss": 24.7812, + "step": 3297 + }, + { + "epoch": 11.906546275395034, + "grad_norm": 232.61126708984375, + "learning_rate": 1.2098003629764066e-05, + "loss": 41.5496, + "step": 3298 + }, + { + "epoch": 11.910158013544018, + "grad_norm": 283.25762939453125, + "learning_rate": 1.2092558983666062e-05, + "loss": 40.7831, + "step": 3299 + }, + { + "epoch": 11.913769751693001, + "grad_norm": 316.6318359375, + "learning_rate": 1.2087114337568059e-05, + "loss": 40.6287, + "step": 3300 + }, + { + "epoch": 11.913769751693001, + "eval_loss": 0.6114257574081421, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 3300 + }, + { + "epoch": 11.917381489841986, + "grad_norm": 248.5615234375, + "learning_rate": 1.2081669691470054e-05, + "loss": 40.5648, + "step": 3301 + }, + { + "epoch": 11.920993227990971, + "grad_norm": 255.31130981445312, + "learning_rate": 1.2076225045372051e-05, + "loss": 42.4736, + "step": 3302 + }, + { + "epoch": 11.924604966139954, + "grad_norm": 229.3546600341797, + "learning_rate": 1.2070780399274048e-05, + "loss": 43.112, + "step": 3303 + }, + { + "epoch": 11.928216704288939, + "grad_norm": 226.89553833007812, + "learning_rate": 1.2065335753176044e-05, + "loss": 37.9527, + "step": 3304 + }, + { + "epoch": 11.931828442437924, + "grad_norm": 210.63919067382812, + "learning_rate": 1.205989110707804e-05, + "loss": 38.7652, + "step": 3305 + }, + { + "epoch": 11.935440180586907, + "grad_norm": 267.75335693359375, + "learning_rate": 1.2054446460980036e-05, + "loss": 39.9077, + "step": 3306 + }, + { + "epoch": 11.939051918735892, + "grad_norm": 255.3372802734375, + "learning_rate": 1.2049001814882033e-05, + "loss": 39.9008, + "step": 3307 + }, + { + "epoch": 11.942663656884875, + "grad_norm": 220.55332946777344, + "learning_rate": 1.2043557168784029e-05, + "loss": 40.8187, + "step": 3308 + }, + { + "epoch": 11.94627539503386, + "grad_norm": 350.15374755859375, + "learning_rate": 1.2038112522686026e-05, + "loss": 40.2937, + "step": 3309 + }, + { + "epoch": 11.949887133182845, + "grad_norm": 296.1144714355469, + "learning_rate": 1.2032667876588021e-05, + "loss": 41.3939, + "step": 3310 + }, + { + "epoch": 11.949887133182845, + "eval_loss": 0.6116041541099548, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3310 + }, + { + "epoch": 11.953498871331828, + "grad_norm": 220.52304077148438, + "learning_rate": 1.202722323049002e-05, + "loss": 39.108, + "step": 3311 + }, + { + "epoch": 11.957110609480813, + "grad_norm": 268.8526916503906, + "learning_rate": 1.2021778584392015e-05, + "loss": 39.547, + "step": 3312 + }, + { + "epoch": 11.960722347629797, + "grad_norm": 205.97677612304688, + "learning_rate": 1.201633393829401e-05, + "loss": 36.7144, + "step": 3313 + }, + { + "epoch": 11.96433408577878, + "grad_norm": 186.62428283691406, + "learning_rate": 1.2010889292196008e-05, + "loss": 34.0491, + "step": 3314 + }, + { + "epoch": 11.967945823927765, + "grad_norm": 214.5521697998047, + "learning_rate": 1.2005444646098003e-05, + "loss": 34.1164, + "step": 3315 + }, + { + "epoch": 11.97155756207675, + "grad_norm": 203.8130340576172, + "learning_rate": 1.2e-05, + "loss": 34.0005, + "step": 3316 + }, + { + "epoch": 11.975169300225733, + "grad_norm": 207.25648498535156, + "learning_rate": 1.1994555353901997e-05, + "loss": 34.0489, + "step": 3317 + }, + { + "epoch": 11.978781038374718, + "grad_norm": 271.1595458984375, + "learning_rate": 1.1989110707803993e-05, + "loss": 35.0359, + "step": 3318 + }, + { + "epoch": 11.982392776523701, + "grad_norm": 266.0697021484375, + "learning_rate": 1.198366606170599e-05, + "loss": 36.4684, + "step": 3319 + }, + { + "epoch": 11.986004514672686, + "grad_norm": 264.1314392089844, + "learning_rate": 1.1978221415607985e-05, + "loss": 35.8805, + "step": 3320 + }, + { + "epoch": 11.986004514672686, + "eval_loss": 0.6101864576339722, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3320 + }, + { + "epoch": 11.989616252821671, + "grad_norm": 266.34295654296875, + "learning_rate": 1.1972776769509982e-05, + "loss": 37.2928, + "step": 3321 + }, + { + "epoch": 11.993227990970654, + "grad_norm": 222.19161987304688, + "learning_rate": 1.196733212341198e-05, + "loss": 29.0638, + "step": 3322 + }, + { + "epoch": 11.996839729119639, + "grad_norm": 244.96974182128906, + "learning_rate": 1.1961887477313975e-05, + "loss": 23.6752, + "step": 3323 + }, + { + "epoch": 12.0, + "grad_norm": 227.6931915283203, + "learning_rate": 1.195644283121597e-05, + "loss": 20.9293, + "step": 3324 + }, + { + "epoch": 12.003611738148985, + "grad_norm": 259.7235412597656, + "learning_rate": 1.1950998185117969e-05, + "loss": 39.7694, + "step": 3325 + }, + { + "epoch": 12.007223476297968, + "grad_norm": 258.8477783203125, + "learning_rate": 1.1945553539019964e-05, + "loss": 41.3742, + "step": 3326 + }, + { + "epoch": 12.010835214446953, + "grad_norm": 216.0697784423828, + "learning_rate": 1.194010889292196e-05, + "loss": 40.0706, + "step": 3327 + }, + { + "epoch": 12.014446952595938, + "grad_norm": 197.73046875, + "learning_rate": 1.1934664246823957e-05, + "loss": 39.844, + "step": 3328 + }, + { + "epoch": 12.01805869074492, + "grad_norm": 190.29563903808594, + "learning_rate": 1.1929219600725952e-05, + "loss": 41.8877, + "step": 3329 + }, + { + "epoch": 12.021670428893906, + "grad_norm": 190.01197814941406, + "learning_rate": 1.1923774954627951e-05, + "loss": 40.5782, + "step": 3330 + }, + { + "epoch": 12.021670428893906, + "eval_loss": 0.6100598573684692, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3330 + }, + { + "epoch": 12.025282167042889, + "grad_norm": 283.20965576171875, + "learning_rate": 1.1918330308529946e-05, + "loss": 42.9183, + "step": 3331 + }, + { + "epoch": 12.028893905191874, + "grad_norm": 227.9106903076172, + "learning_rate": 1.1912885662431942e-05, + "loss": 41.4606, + "step": 3332 + }, + { + "epoch": 12.032505643340858, + "grad_norm": 217.31640625, + "learning_rate": 1.1907441016333939e-05, + "loss": 40.527, + "step": 3333 + }, + { + "epoch": 12.036117381489841, + "grad_norm": 181.33787536621094, + "learning_rate": 1.1901996370235936e-05, + "loss": 40.2536, + "step": 3334 + }, + { + "epoch": 12.039729119638826, + "grad_norm": 210.638427734375, + "learning_rate": 1.1896551724137931e-05, + "loss": 39.0234, + "step": 3335 + }, + { + "epoch": 12.043340857787811, + "grad_norm": 222.1325225830078, + "learning_rate": 1.1891107078039928e-05, + "loss": 36.6929, + "step": 3336 + }, + { + "epoch": 12.046952595936794, + "grad_norm": 195.0751953125, + "learning_rate": 1.1885662431941924e-05, + "loss": 37.9547, + "step": 3337 + }, + { + "epoch": 12.050564334085779, + "grad_norm": 287.6582946777344, + "learning_rate": 1.1880217785843919e-05, + "loss": 37.9016, + "step": 3338 + }, + { + "epoch": 12.054176072234762, + "grad_norm": 351.43701171875, + "learning_rate": 1.1874773139745918e-05, + "loss": 40.014, + "step": 3339 + }, + { + "epoch": 12.057787810383747, + "grad_norm": 212.9033966064453, + "learning_rate": 1.1869328493647913e-05, + "loss": 37.8761, + "step": 3340 + }, + { + "epoch": 12.057787810383747, + "eval_loss": 0.6093400120735168, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 3340 + }, + { + "epoch": 12.061399548532732, + "grad_norm": 268.8284912109375, + "learning_rate": 1.186388384754991e-05, + "loss": 38.7171, + "step": 3341 + }, + { + "epoch": 12.065011286681715, + "grad_norm": 193.27267456054688, + "learning_rate": 1.1858439201451906e-05, + "loss": 38.4908, + "step": 3342 + }, + { + "epoch": 12.0686230248307, + "grad_norm": 244.18124389648438, + "learning_rate": 1.1852994555353901e-05, + "loss": 37.9388, + "step": 3343 + }, + { + "epoch": 12.072234762979685, + "grad_norm": 311.6593933105469, + "learning_rate": 1.18475499092559e-05, + "loss": 38.4287, + "step": 3344 + }, + { + "epoch": 12.075846501128668, + "grad_norm": 239.28526306152344, + "learning_rate": 1.1842105263157895e-05, + "loss": 38.1349, + "step": 3345 + }, + { + "epoch": 12.079458239277653, + "grad_norm": 312.1795654296875, + "learning_rate": 1.183666061705989e-05, + "loss": 39.8067, + "step": 3346 + }, + { + "epoch": 12.083069977426636, + "grad_norm": 303.3067932128906, + "learning_rate": 1.1831215970961888e-05, + "loss": 40.0617, + "step": 3347 + }, + { + "epoch": 12.08668171557562, + "grad_norm": 280.8705749511719, + "learning_rate": 1.1825771324863885e-05, + "loss": 39.244, + "step": 3348 + }, + { + "epoch": 12.090293453724605, + "grad_norm": 249.89671325683594, + "learning_rate": 1.182032667876588e-05, + "loss": 39.0047, + "step": 3349 + }, + { + "epoch": 12.093905191873588, + "grad_norm": 226.19195556640625, + "learning_rate": 1.1814882032667877e-05, + "loss": 40.8044, + "step": 3350 + }, + { + "epoch": 12.093905191873588, + "eval_loss": 0.6100687384605408, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 3350 + }, + { + "epoch": 12.097516930022573, + "grad_norm": 250.29306030273438, + "learning_rate": 1.1809437386569873e-05, + "loss": 38.0745, + "step": 3351 + }, + { + "epoch": 12.101128668171558, + "grad_norm": 255.06137084960938, + "learning_rate": 1.180399274047187e-05, + "loss": 37.2922, + "step": 3352 + }, + { + "epoch": 12.104740406320541, + "grad_norm": 293.59185791015625, + "learning_rate": 1.1798548094373867e-05, + "loss": 35.488, + "step": 3353 + }, + { + "epoch": 12.108352144469526, + "grad_norm": 260.9599914550781, + "learning_rate": 1.1793103448275862e-05, + "loss": 32.8175, + "step": 3354 + }, + { + "epoch": 12.111963882618511, + "grad_norm": 387.63671875, + "learning_rate": 1.178765880217786e-05, + "loss": 31.3901, + "step": 3355 + }, + { + "epoch": 12.115575620767494, + "grad_norm": 216.2008819580078, + "learning_rate": 1.1782214156079855e-05, + "loss": 32.9512, + "step": 3356 + }, + { + "epoch": 12.119187358916479, + "grad_norm": 260.510498046875, + "learning_rate": 1.177676950998185e-05, + "loss": 31.838, + "step": 3357 + }, + { + "epoch": 12.122799097065462, + "grad_norm": 215.96522521972656, + "learning_rate": 1.1771324863883849e-05, + "loss": 33.5854, + "step": 3358 + }, + { + "epoch": 12.126410835214447, + "grad_norm": 277.2855529785156, + "learning_rate": 1.1765880217785844e-05, + "loss": 34.947, + "step": 3359 + }, + { + "epoch": 12.130022573363432, + "grad_norm": 199.53759765625, + "learning_rate": 1.176043557168784e-05, + "loss": 34.3862, + "step": 3360 + }, + { + "epoch": 12.130022573363432, + "eval_loss": 0.6107886433601379, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 3360 + }, + { + "epoch": 12.133634311512415, + "grad_norm": 244.73654174804688, + "learning_rate": 1.1754990925589837e-05, + "loss": 34.5678, + "step": 3361 + }, + { + "epoch": 12.1372460496614, + "grad_norm": 335.4967346191406, + "learning_rate": 1.1749546279491834e-05, + "loss": 35.8974, + "step": 3362 + }, + { + "epoch": 12.140857787810384, + "grad_norm": 269.8370056152344, + "learning_rate": 1.174410163339383e-05, + "loss": 36.3458, + "step": 3363 + }, + { + "epoch": 12.144469525959368, + "grad_norm": 230.82492065429688, + "learning_rate": 1.1738656987295826e-05, + "loss": 34.6797, + "step": 3364 + }, + { + "epoch": 12.148081264108352, + "grad_norm": 266.6196594238281, + "learning_rate": 1.1733212341197822e-05, + "loss": 35.5799, + "step": 3365 + }, + { + "epoch": 12.151693002257336, + "grad_norm": 268.1825256347656, + "learning_rate": 1.1727767695099819e-05, + "loss": 34.9859, + "step": 3366 + }, + { + "epoch": 12.15530474040632, + "grad_norm": 259.6159362792969, + "learning_rate": 1.1722323049001816e-05, + "loss": 37.2283, + "step": 3367 + }, + { + "epoch": 12.158916478555305, + "grad_norm": 225.1367645263672, + "learning_rate": 1.1716878402903811e-05, + "loss": 37.4073, + "step": 3368 + }, + { + "epoch": 12.162528216704288, + "grad_norm": 277.8457946777344, + "learning_rate": 1.1711433756805808e-05, + "loss": 36.3491, + "step": 3369 + }, + { + "epoch": 12.166139954853273, + "grad_norm": 273.1939697265625, + "learning_rate": 1.1705989110707804e-05, + "loss": 31.4646, + "step": 3370 + }, + { + "epoch": 12.166139954853273, + "eval_loss": 0.6099494695663452, + "eval_runtime": 3.1323, + "eval_samples_per_second": 57.146, + "eval_steps_per_second": 57.146, + "step": 3370 + }, + { + "epoch": 12.169751693002258, + "grad_norm": 199.32516479492188, + "learning_rate": 1.17005444646098e-05, + "loss": 22.7125, + "step": 3371 + }, + { + "epoch": 12.173363431151241, + "grad_norm": 195.47630310058594, + "learning_rate": 1.1695099818511798e-05, + "loss": 22.7899, + "step": 3372 + }, + { + "epoch": 12.176975169300226, + "grad_norm": 220.02413940429688, + "learning_rate": 1.1689655172413793e-05, + "loss": 23.4427, + "step": 3373 + }, + { + "epoch": 12.18058690744921, + "grad_norm": 215.43287658691406, + "learning_rate": 1.168421052631579e-05, + "loss": 24.1504, + "step": 3374 + }, + { + "epoch": 12.184198645598194, + "grad_norm": 298.2409973144531, + "learning_rate": 1.1678765880217786e-05, + "loss": 41.4955, + "step": 3375 + }, + { + "epoch": 12.187810383747179, + "grad_norm": 235.94728088378906, + "learning_rate": 1.1673321234119783e-05, + "loss": 42.4273, + "step": 3376 + }, + { + "epoch": 12.191422121896162, + "grad_norm": 235.44480895996094, + "learning_rate": 1.166787658802178e-05, + "loss": 40.6468, + "step": 3377 + }, + { + "epoch": 12.195033860045147, + "grad_norm": 281.5338439941406, + "learning_rate": 1.1662431941923775e-05, + "loss": 39.8335, + "step": 3378 + }, + { + "epoch": 12.198645598194132, + "grad_norm": 185.87339782714844, + "learning_rate": 1.165698729582577e-05, + "loss": 40.8669, + "step": 3379 + }, + { + "epoch": 12.202257336343115, + "grad_norm": 218.88861083984375, + "learning_rate": 1.1651542649727768e-05, + "loss": 40.1351, + "step": 3380 + }, + { + "epoch": 12.202257336343115, + "eval_loss": 0.6128573417663574, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 3380 + }, + { + "epoch": 12.2058690744921, + "grad_norm": 192.7227783203125, + "learning_rate": 1.1646098003629765e-05, + "loss": 40.4448, + "step": 3381 + }, + { + "epoch": 12.209480812641084, + "grad_norm": 219.68093872070312, + "learning_rate": 1.1640653357531762e-05, + "loss": 41.579, + "step": 3382 + }, + { + "epoch": 12.213092550790067, + "grad_norm": 235.8788299560547, + "learning_rate": 1.1635208711433757e-05, + "loss": 41.3374, + "step": 3383 + }, + { + "epoch": 12.216704288939052, + "grad_norm": 245.11935424804688, + "learning_rate": 1.1629764065335752e-05, + "loss": 41.1151, + "step": 3384 + }, + { + "epoch": 12.220316027088035, + "grad_norm": 260.2931823730469, + "learning_rate": 1.1624319419237751e-05, + "loss": 38.9502, + "step": 3385 + }, + { + "epoch": 12.22392776523702, + "grad_norm": 240.62734985351562, + "learning_rate": 1.1618874773139747e-05, + "loss": 38.6309, + "step": 3386 + }, + { + "epoch": 12.227539503386005, + "grad_norm": 230.9380645751953, + "learning_rate": 1.1613430127041742e-05, + "loss": 38.3077, + "step": 3387 + }, + { + "epoch": 12.231151241534988, + "grad_norm": 234.40687561035156, + "learning_rate": 1.1607985480943739e-05, + "loss": 37.1566, + "step": 3388 + }, + { + "epoch": 12.234762979683973, + "grad_norm": 216.580810546875, + "learning_rate": 1.1602540834845734e-05, + "loss": 38.4919, + "step": 3389 + }, + { + "epoch": 12.238374717832958, + "grad_norm": 210.75079345703125, + "learning_rate": 1.1597096188747732e-05, + "loss": 38.1647, + "step": 3390 + }, + { + "epoch": 12.238374717832958, + "eval_loss": 0.6105583906173706, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3390 + }, + { + "epoch": 12.241986455981941, + "grad_norm": 207.82180786132812, + "learning_rate": 1.1591651542649729e-05, + "loss": 38.5585, + "step": 3391 + }, + { + "epoch": 12.245598194130926, + "grad_norm": 186.55081176757812, + "learning_rate": 1.1586206896551724e-05, + "loss": 38.0183, + "step": 3392 + }, + { + "epoch": 12.249209932279909, + "grad_norm": 179.60572814941406, + "learning_rate": 1.1580762250453721e-05, + "loss": 39.6951, + "step": 3393 + }, + { + "epoch": 12.252821670428894, + "grad_norm": 212.59837341308594, + "learning_rate": 1.1575317604355718e-05, + "loss": 39.2908, + "step": 3394 + }, + { + "epoch": 12.256433408577879, + "grad_norm": 239.90997314453125, + "learning_rate": 1.1569872958257714e-05, + "loss": 39.9409, + "step": 3395 + }, + { + "epoch": 12.260045146726862, + "grad_norm": 240.729248046875, + "learning_rate": 1.156442831215971e-05, + "loss": 39.2386, + "step": 3396 + }, + { + "epoch": 12.263656884875846, + "grad_norm": 248.6179962158203, + "learning_rate": 1.1558983666061706e-05, + "loss": 37.3296, + "step": 3397 + }, + { + "epoch": 12.267268623024831, + "grad_norm": 192.55084228515625, + "learning_rate": 1.1553539019963701e-05, + "loss": 40.1156, + "step": 3398 + }, + { + "epoch": 12.270880361173814, + "grad_norm": 217.89109802246094, + "learning_rate": 1.15480943738657e-05, + "loss": 41.0677, + "step": 3399 + }, + { + "epoch": 12.2744920993228, + "grad_norm": 240.77633666992188, + "learning_rate": 1.1542649727767695e-05, + "loss": 39.3552, + "step": 3400 + }, + { + "epoch": 12.2744920993228, + "eval_loss": 0.6094763278961182, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 3400 + }, + { + "epoch": 12.278103837471784, + "grad_norm": 210.38153076171875, + "learning_rate": 1.1537205081669691e-05, + "loss": 40.2202, + "step": 3401 + }, + { + "epoch": 12.281715575620767, + "grad_norm": 195.49087524414062, + "learning_rate": 1.1531760435571688e-05, + "loss": 37.5473, + "step": 3402 + }, + { + "epoch": 12.285327313769752, + "grad_norm": 254.43972778320312, + "learning_rate": 1.1526315789473683e-05, + "loss": 37.8032, + "step": 3403 + }, + { + "epoch": 12.288939051918735, + "grad_norm": 205.09913635253906, + "learning_rate": 1.1520871143375682e-05, + "loss": 35.1317, + "step": 3404 + }, + { + "epoch": 12.29255079006772, + "grad_norm": 241.22930908203125, + "learning_rate": 1.1515426497277677e-05, + "loss": 32.7809, + "step": 3405 + }, + { + "epoch": 12.296162528216705, + "grad_norm": 226.75311279296875, + "learning_rate": 1.1509981851179673e-05, + "loss": 32.5354, + "step": 3406 + }, + { + "epoch": 12.299774266365688, + "grad_norm": 323.5389709472656, + "learning_rate": 1.150453720508167e-05, + "loss": 33.1533, + "step": 3407 + }, + { + "epoch": 12.303386004514673, + "grad_norm": 306.7039794921875, + "learning_rate": 1.1499092558983667e-05, + "loss": 33.7924, + "step": 3408 + }, + { + "epoch": 12.306997742663658, + "grad_norm": 221.53897094726562, + "learning_rate": 1.1493647912885662e-05, + "loss": 33.829, + "step": 3409 + }, + { + "epoch": 12.31060948081264, + "grad_norm": 301.59527587890625, + "learning_rate": 1.148820326678766e-05, + "loss": 35.4583, + "step": 3410 + }, + { + "epoch": 12.31060948081264, + "eval_loss": 0.6092248558998108, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.058, + "eval_steps_per_second": 57.058, + "step": 3410 + }, + { + "epoch": 12.314221218961626, + "grad_norm": 229.63221740722656, + "learning_rate": 1.1482758620689655e-05, + "loss": 34.3258, + "step": 3411 + }, + { + "epoch": 12.317832957110609, + "grad_norm": 280.6421203613281, + "learning_rate": 1.147731397459165e-05, + "loss": 33.4522, + "step": 3412 + }, + { + "epoch": 12.321444695259594, + "grad_norm": 305.6673889160156, + "learning_rate": 1.1471869328493649e-05, + "loss": 34.8911, + "step": 3413 + }, + { + "epoch": 12.325056433408578, + "grad_norm": 278.5484924316406, + "learning_rate": 1.1466424682395644e-05, + "loss": 36.2668, + "step": 3414 + }, + { + "epoch": 12.328668171557561, + "grad_norm": 246.88082885742188, + "learning_rate": 1.1460980036297641e-05, + "loss": 34.8401, + "step": 3415 + }, + { + "epoch": 12.332279909706546, + "grad_norm": 279.730712890625, + "learning_rate": 1.1455535390199637e-05, + "loss": 36.2382, + "step": 3416 + }, + { + "epoch": 12.335891647855531, + "grad_norm": 243.62918090820312, + "learning_rate": 1.1450090744101634e-05, + "loss": 37.0742, + "step": 3417 + }, + { + "epoch": 12.339503386004514, + "grad_norm": 280.5240783691406, + "learning_rate": 1.1444646098003631e-05, + "loss": 37.0223, + "step": 3418 + }, + { + "epoch": 12.343115124153499, + "grad_norm": 270.56396484375, + "learning_rate": 1.1439201451905626e-05, + "loss": 34.8413, + "step": 3419 + }, + { + "epoch": 12.346726862302482, + "grad_norm": 246.56292724609375, + "learning_rate": 1.1433756805807622e-05, + "loss": 26.5596, + "step": 3420 + }, + { + "epoch": 12.346726862302482, + "eval_loss": 0.6123174428939819, + "eval_runtime": 3.1325, + "eval_samples_per_second": 57.143, + "eval_steps_per_second": 57.143, + "step": 3420 + }, + { + "epoch": 12.350338600451467, + "grad_norm": 199.72242736816406, + "learning_rate": 1.1428312159709619e-05, + "loss": 23.3959, + "step": 3421 + }, + { + "epoch": 12.353950338600452, + "grad_norm": 264.9206848144531, + "learning_rate": 1.1422867513611616e-05, + "loss": 23.448, + "step": 3422 + }, + { + "epoch": 12.357562076749435, + "grad_norm": 198.09420776367188, + "learning_rate": 1.1417422867513613e-05, + "loss": 23.4526, + "step": 3423 + }, + { + "epoch": 12.36117381489842, + "grad_norm": 191.74949645996094, + "learning_rate": 1.1411978221415608e-05, + "loss": 23.9586, + "step": 3424 + }, + { + "epoch": 12.364785553047405, + "grad_norm": 270.4527893066406, + "learning_rate": 1.1406533575317604e-05, + "loss": 41.2497, + "step": 3425 + }, + { + "epoch": 12.368397291196388, + "grad_norm": 253.06109619140625, + "learning_rate": 1.1401088929219601e-05, + "loss": 41.7598, + "step": 3426 + }, + { + "epoch": 12.372009029345373, + "grad_norm": 389.3164978027344, + "learning_rate": 1.1395644283121598e-05, + "loss": 42.1145, + "step": 3427 + }, + { + "epoch": 12.375620767494357, + "grad_norm": 405.1527404785156, + "learning_rate": 1.1390199637023593e-05, + "loss": 39.8163, + "step": 3428 + }, + { + "epoch": 12.37923250564334, + "grad_norm": 360.5083312988281, + "learning_rate": 1.138475499092559e-05, + "loss": 40.7344, + "step": 3429 + }, + { + "epoch": 12.382844243792325, + "grad_norm": 276.3650207519531, + "learning_rate": 1.1379310344827586e-05, + "loss": 40.6678, + "step": 3430 + }, + { + "epoch": 12.382844243792325, + "eval_loss": 0.612799346446991, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3430 + }, + { + "epoch": 12.386455981941308, + "grad_norm": 222.34078979492188, + "learning_rate": 1.1373865698729583e-05, + "loss": 39.8701, + "step": 3431 + }, + { + "epoch": 12.390067720090293, + "grad_norm": 242.1103515625, + "learning_rate": 1.136842105263158e-05, + "loss": 42.031, + "step": 3432 + }, + { + "epoch": 12.393679458239278, + "grad_norm": 231.30453491210938, + "learning_rate": 1.1362976406533575e-05, + "loss": 40.7321, + "step": 3433 + }, + { + "epoch": 12.397291196388261, + "grad_norm": 302.65179443359375, + "learning_rate": 1.1357531760435572e-05, + "loss": 41.5889, + "step": 3434 + }, + { + "epoch": 12.400902934537246, + "grad_norm": 296.4203796386719, + "learning_rate": 1.1352087114337568e-05, + "loss": 40.3939, + "step": 3435 + }, + { + "epoch": 12.404514672686231, + "grad_norm": 281.8349304199219, + "learning_rate": 1.1346642468239565e-05, + "loss": 37.9457, + "step": 3436 + }, + { + "epoch": 12.408126410835214, + "grad_norm": 228.9622039794922, + "learning_rate": 1.1341197822141562e-05, + "loss": 37.4727, + "step": 3437 + }, + { + "epoch": 12.411738148984199, + "grad_norm": 276.8975524902344, + "learning_rate": 1.1335753176043557e-05, + "loss": 36.4285, + "step": 3438 + }, + { + "epoch": 12.415349887133182, + "grad_norm": 218.76206970214844, + "learning_rate": 1.1330308529945553e-05, + "loss": 37.7888, + "step": 3439 + }, + { + "epoch": 12.418961625282167, + "grad_norm": 277.31329345703125, + "learning_rate": 1.1324863883847551e-05, + "loss": 38.6416, + "step": 3440 + }, + { + "epoch": 12.418961625282167, + "eval_loss": 0.6118359565734863, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.105, + "eval_steps_per_second": 57.105, + "step": 3440 + }, + { + "epoch": 12.422573363431152, + "grad_norm": 239.2766876220703, + "learning_rate": 1.1319419237749547e-05, + "loss": 38.3779, + "step": 3441 + }, + { + "epoch": 12.426185101580135, + "grad_norm": 255.43939208984375, + "learning_rate": 1.1313974591651542e-05, + "loss": 38.7581, + "step": 3442 + }, + { + "epoch": 12.42979683972912, + "grad_norm": 196.33380126953125, + "learning_rate": 1.130852994555354e-05, + "loss": 40.1953, + "step": 3443 + }, + { + "epoch": 12.433408577878104, + "grad_norm": 284.2427062988281, + "learning_rate": 1.1303085299455535e-05, + "loss": 39.2743, + "step": 3444 + }, + { + "epoch": 12.437020316027088, + "grad_norm": 303.0172424316406, + "learning_rate": 1.1297640653357533e-05, + "loss": 39.4786, + "step": 3445 + }, + { + "epoch": 12.440632054176072, + "grad_norm": 231.17999267578125, + "learning_rate": 1.1292196007259529e-05, + "loss": 38.6038, + "step": 3446 + }, + { + "epoch": 12.444243792325057, + "grad_norm": 228.89599609375, + "learning_rate": 1.1286751361161524e-05, + "loss": 39.0235, + "step": 3447 + }, + { + "epoch": 12.44785553047404, + "grad_norm": 247.05203247070312, + "learning_rate": 1.1281306715063521e-05, + "loss": 39.9779, + "step": 3448 + }, + { + "epoch": 12.451467268623025, + "grad_norm": 221.5463104248047, + "learning_rate": 1.1275862068965517e-05, + "loss": 40.4104, + "step": 3449 + }, + { + "epoch": 12.455079006772008, + "grad_norm": 254.12820434570312, + "learning_rate": 1.1270417422867514e-05, + "loss": 40.8093, + "step": 3450 + }, + { + "epoch": 12.455079006772008, + "eval_loss": 0.6093817353248596, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 3450 + }, + { + "epoch": 12.458690744920993, + "grad_norm": 214.2323760986328, + "learning_rate": 1.1264972776769511e-05, + "loss": 40.3578, + "step": 3451 + }, + { + "epoch": 12.462302483069978, + "grad_norm": 230.64718627929688, + "learning_rate": 1.1259528130671506e-05, + "loss": 39.772, + "step": 3452 + }, + { + "epoch": 12.465914221218961, + "grad_norm": 217.81838989257812, + "learning_rate": 1.1254083484573502e-05, + "loss": 36.8193, + "step": 3453 + }, + { + "epoch": 12.469525959367946, + "grad_norm": 292.7674560546875, + "learning_rate": 1.12486388384755e-05, + "loss": 33.891, + "step": 3454 + }, + { + "epoch": 12.47313769751693, + "grad_norm": 241.6099395751953, + "learning_rate": 1.1243194192377496e-05, + "loss": 34.8947, + "step": 3455 + }, + { + "epoch": 12.476749435665914, + "grad_norm": 220.97128295898438, + "learning_rate": 1.1237749546279493e-05, + "loss": 31.7715, + "step": 3456 + }, + { + "epoch": 12.480361173814899, + "grad_norm": 191.04376220703125, + "learning_rate": 1.1232304900181488e-05, + "loss": 32.3878, + "step": 3457 + }, + { + "epoch": 12.483972911963882, + "grad_norm": 192.3009796142578, + "learning_rate": 1.1226860254083484e-05, + "loss": 33.3116, + "step": 3458 + }, + { + "epoch": 12.487584650112867, + "grad_norm": 214.22459411621094, + "learning_rate": 1.1221415607985482e-05, + "loss": 34.1394, + "step": 3459 + }, + { + "epoch": 12.491196388261852, + "grad_norm": 225.24191284179688, + "learning_rate": 1.1215970961887478e-05, + "loss": 34.9381, + "step": 3460 + }, + { + "epoch": 12.491196388261852, + "eval_loss": 0.6095408201217651, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 3460 + }, + { + "epoch": 12.494808126410835, + "grad_norm": 240.89199829101562, + "learning_rate": 1.1210526315789473e-05, + "loss": 34.5342, + "step": 3461 + }, + { + "epoch": 12.49841986455982, + "grad_norm": 263.5467224121094, + "learning_rate": 1.120508166969147e-05, + "loss": 35.3287, + "step": 3462 + }, + { + "epoch": 12.502031602708804, + "grad_norm": 253.0650634765625, + "learning_rate": 1.1199637023593467e-05, + "loss": 35.4859, + "step": 3463 + }, + { + "epoch": 12.505643340857787, + "grad_norm": 279.4447937011719, + "learning_rate": 1.1194192377495463e-05, + "loss": 33.919, + "step": 3464 + }, + { + "epoch": 12.509255079006772, + "grad_norm": 246.6184844970703, + "learning_rate": 1.118874773139746e-05, + "loss": 35.2743, + "step": 3465 + }, + { + "epoch": 12.512866817155757, + "grad_norm": 228.4134979248047, + "learning_rate": 1.1183303085299455e-05, + "loss": 36.0865, + "step": 3466 + }, + { + "epoch": 12.51647855530474, + "grad_norm": 264.87835693359375, + "learning_rate": 1.1177858439201452e-05, + "loss": 36.1596, + "step": 3467 + }, + { + "epoch": 12.520090293453725, + "grad_norm": 252.2872772216797, + "learning_rate": 1.117241379310345e-05, + "loss": 35.7293, + "step": 3468 + }, + { + "epoch": 12.523702031602708, + "grad_norm": 277.3695373535156, + "learning_rate": 1.1166969147005445e-05, + "loss": 36.8009, + "step": 3469 + }, + { + "epoch": 12.527313769751693, + "grad_norm": 255.64610290527344, + "learning_rate": 1.1161524500907442e-05, + "loss": 28.5986, + "step": 3470 + }, + { + "epoch": 12.527313769751693, + "eval_loss": 0.6122347116470337, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.136, + "eval_steps_per_second": 57.136, + "step": 3470 + }, + { + "epoch": 12.530925507900678, + "grad_norm": 256.1487121582031, + "learning_rate": 1.1156079854809437e-05, + "loss": 23.1289, + "step": 3471 + }, + { + "epoch": 12.534537246049661, + "grad_norm": 261.9757080078125, + "learning_rate": 1.1150635208711433e-05, + "loss": 22.3379, + "step": 3472 + }, + { + "epoch": 12.538148984198646, + "grad_norm": 194.83432006835938, + "learning_rate": 1.1145190562613431e-05, + "loss": 23.6192, + "step": 3473 + }, + { + "epoch": 12.54176072234763, + "grad_norm": 241.51089477539062, + "learning_rate": 1.1139745916515427e-05, + "loss": 24.0314, + "step": 3474 + }, + { + "epoch": 12.545372460496614, + "grad_norm": 242.6024932861328, + "learning_rate": 1.1134301270417424e-05, + "loss": 40.2969, + "step": 3475 + }, + { + "epoch": 12.548984198645599, + "grad_norm": 292.17303466796875, + "learning_rate": 1.112885662431942e-05, + "loss": 42.3448, + "step": 3476 + }, + { + "epoch": 12.552595936794582, + "grad_norm": 232.811767578125, + "learning_rate": 1.1123411978221416e-05, + "loss": 41.7642, + "step": 3477 + }, + { + "epoch": 12.556207674943566, + "grad_norm": 238.43162536621094, + "learning_rate": 1.1117967332123413e-05, + "loss": 41.0827, + "step": 3478 + }, + { + "epoch": 12.559819413092551, + "grad_norm": 290.20159912109375, + "learning_rate": 1.1112522686025409e-05, + "loss": 41.3795, + "step": 3479 + }, + { + "epoch": 12.563431151241534, + "grad_norm": 197.52903747558594, + "learning_rate": 1.1107078039927404e-05, + "loss": 40.6337, + "step": 3480 + }, + { + "epoch": 12.563431151241534, + "eval_loss": 0.6133883595466614, + "eval_runtime": 3.1329, + "eval_samples_per_second": 57.135, + "eval_steps_per_second": 57.135, + "step": 3480 + }, + { + "epoch": 12.56704288939052, + "grad_norm": 259.8161926269531, + "learning_rate": 1.1101633393829401e-05, + "loss": 40.2626, + "step": 3481 + }, + { + "epoch": 12.570654627539504, + "grad_norm": 196.7882537841797, + "learning_rate": 1.1096188747731398e-05, + "loss": 41.0171, + "step": 3482 + }, + { + "epoch": 12.574266365688487, + "grad_norm": 216.27642822265625, + "learning_rate": 1.1090744101633394e-05, + "loss": 42.1328, + "step": 3483 + }, + { + "epoch": 12.577878103837472, + "grad_norm": 292.6575012207031, + "learning_rate": 1.108529945553539e-05, + "loss": 39.9502, + "step": 3484 + }, + { + "epoch": 12.581489841986457, + "grad_norm": 254.43344116210938, + "learning_rate": 1.1079854809437386e-05, + "loss": 41.3409, + "step": 3485 + }, + { + "epoch": 12.58510158013544, + "grad_norm": 211.3965606689453, + "learning_rate": 1.1074410163339385e-05, + "loss": 39.6898, + "step": 3486 + }, + { + "epoch": 12.588713318284425, + "grad_norm": 196.2000274658203, + "learning_rate": 1.106896551724138e-05, + "loss": 38.0837, + "step": 3487 + }, + { + "epoch": 12.592325056433408, + "grad_norm": 224.4564666748047, + "learning_rate": 1.1063520871143376e-05, + "loss": 38.479, + "step": 3488 + }, + { + "epoch": 12.595936794582393, + "grad_norm": 215.7074432373047, + "learning_rate": 1.1058076225045373e-05, + "loss": 38.3103, + "step": 3489 + }, + { + "epoch": 12.599548532731378, + "grad_norm": 278.2279052734375, + "learning_rate": 1.1052631578947368e-05, + "loss": 37.9399, + "step": 3490 + }, + { + "epoch": 12.599548532731378, + "eval_loss": 0.6091782450675964, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.127, + "eval_steps_per_second": 57.127, + "step": 3490 + }, + { + "epoch": 12.60316027088036, + "grad_norm": 236.7021942138672, + "learning_rate": 1.1047186932849365e-05, + "loss": 38.185, + "step": 3491 + }, + { + "epoch": 12.606772009029346, + "grad_norm": 200.35169982910156, + "learning_rate": 1.1041742286751362e-05, + "loss": 38.7405, + "step": 3492 + }, + { + "epoch": 12.610383747178329, + "grad_norm": 211.9726104736328, + "learning_rate": 1.1036297640653358e-05, + "loss": 39.8351, + "step": 3493 + }, + { + "epoch": 12.613995485327314, + "grad_norm": 303.5962829589844, + "learning_rate": 1.1030852994555353e-05, + "loss": 39.3039, + "step": 3494 + }, + { + "epoch": 12.617607223476298, + "grad_norm": 298.086181640625, + "learning_rate": 1.102540834845735e-05, + "loss": 39.9149, + "step": 3495 + }, + { + "epoch": 12.621218961625281, + "grad_norm": 255.69854736328125, + "learning_rate": 1.1019963702359347e-05, + "loss": 36.3617, + "step": 3496 + }, + { + "epoch": 12.624830699774266, + "grad_norm": 273.2884216308594, + "learning_rate": 1.1014519056261344e-05, + "loss": 38.6865, + "step": 3497 + }, + { + "epoch": 12.628442437923251, + "grad_norm": 211.17837524414062, + "learning_rate": 1.100907441016334e-05, + "loss": 40.2771, + "step": 3498 + }, + { + "epoch": 12.632054176072234, + "grad_norm": 253.9141845703125, + "learning_rate": 1.1003629764065335e-05, + "loss": 40.3644, + "step": 3499 + }, + { + "epoch": 12.635665914221219, + "grad_norm": 247.4141082763672, + "learning_rate": 1.0998185117967334e-05, + "loss": 39.9754, + "step": 3500 + }, + { + "epoch": 12.635665914221219, + "eval_loss": 0.6086810827255249, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3500 + }, + { + "epoch": 12.639277652370204, + "grad_norm": 237.3258056640625, + "learning_rate": 1.0992740471869329e-05, + "loss": 39.9438, + "step": 3501 + }, + { + "epoch": 12.642889390519187, + "grad_norm": 252.87744140625, + "learning_rate": 1.0987295825771325e-05, + "loss": 39.9713, + "step": 3502 + }, + { + "epoch": 12.646501128668172, + "grad_norm": 341.2947998046875, + "learning_rate": 1.0981851179673322e-05, + "loss": 36.54, + "step": 3503 + }, + { + "epoch": 12.650112866817155, + "grad_norm": 212.7144317626953, + "learning_rate": 1.0976406533575317e-05, + "loss": 33.2737, + "step": 3504 + }, + { + "epoch": 12.65372460496614, + "grad_norm": 220.15846252441406, + "learning_rate": 1.0970961887477314e-05, + "loss": 34.8862, + "step": 3505 + }, + { + "epoch": 12.657336343115125, + "grad_norm": 235.8145294189453, + "learning_rate": 1.0965517241379311e-05, + "loss": 31.637, + "step": 3506 + }, + { + "epoch": 12.660948081264108, + "grad_norm": 274.13140869140625, + "learning_rate": 1.0960072595281307e-05, + "loss": 33.6111, + "step": 3507 + }, + { + "epoch": 12.664559819413093, + "grad_norm": 259.9810791015625, + "learning_rate": 1.0954627949183304e-05, + "loss": 34.7118, + "step": 3508 + }, + { + "epoch": 12.668171557562077, + "grad_norm": 244.6074676513672, + "learning_rate": 1.0949183303085299e-05, + "loss": 34.3987, + "step": 3509 + }, + { + "epoch": 12.67178329571106, + "grad_norm": 264.0238037109375, + "learning_rate": 1.0943738656987296e-05, + "loss": 34.7304, + "step": 3510 + }, + { + "epoch": 12.67178329571106, + "eval_loss": 0.6089194416999817, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 3510 + }, + { + "epoch": 12.675395033860045, + "grad_norm": 286.857421875, + "learning_rate": 1.0938294010889293e-05, + "loss": 34.5722, + "step": 3511 + }, + { + "epoch": 12.679006772009028, + "grad_norm": 270.7839660644531, + "learning_rate": 1.0932849364791289e-05, + "loss": 35.6129, + "step": 3512 + }, + { + "epoch": 12.682618510158013, + "grad_norm": 214.4302978515625, + "learning_rate": 1.0927404718693284e-05, + "loss": 34.4318, + "step": 3513 + }, + { + "epoch": 12.686230248306998, + "grad_norm": 362.6913757324219, + "learning_rate": 1.0921960072595283e-05, + "loss": 35.6578, + "step": 3514 + }, + { + "epoch": 12.689841986455981, + "grad_norm": 266.5205993652344, + "learning_rate": 1.0916515426497278e-05, + "loss": 35.8627, + "step": 3515 + }, + { + "epoch": 12.693453724604966, + "grad_norm": 271.8298034667969, + "learning_rate": 1.0911070780399275e-05, + "loss": 36.8931, + "step": 3516 + }, + { + "epoch": 12.697065462753951, + "grad_norm": 230.13815307617188, + "learning_rate": 1.090562613430127e-05, + "loss": 35.8972, + "step": 3517 + }, + { + "epoch": 12.700677200902934, + "grad_norm": 235.57127380371094, + "learning_rate": 1.0900181488203266e-05, + "loss": 36.7884, + "step": 3518 + }, + { + "epoch": 12.704288939051919, + "grad_norm": 274.0856018066406, + "learning_rate": 1.0894736842105265e-05, + "loss": 35.938, + "step": 3519 + }, + { + "epoch": 12.707900677200904, + "grad_norm": 251.9855194091797, + "learning_rate": 1.088929219600726e-05, + "loss": 30.846, + "step": 3520 + }, + { + "epoch": 12.707900677200904, + "eval_loss": 0.6102532148361206, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 3520 + }, + { + "epoch": 12.711512415349887, + "grad_norm": 254.11465454101562, + "learning_rate": 1.0883847549909255e-05, + "loss": 22.8538, + "step": 3521 + }, + { + "epoch": 12.715124153498872, + "grad_norm": 233.05821228027344, + "learning_rate": 1.0878402903811253e-05, + "loss": 22.3346, + "step": 3522 + }, + { + "epoch": 12.718735891647855, + "grad_norm": 223.46646118164062, + "learning_rate": 1.087295825771325e-05, + "loss": 23.8109, + "step": 3523 + }, + { + "epoch": 12.72234762979684, + "grad_norm": 209.4064483642578, + "learning_rate": 1.0867513611615245e-05, + "loss": 24.7694, + "step": 3524 + }, + { + "epoch": 12.725959367945824, + "grad_norm": 299.6215515136719, + "learning_rate": 1.0862068965517242e-05, + "loss": 40.8879, + "step": 3525 + }, + { + "epoch": 12.729571106094808, + "grad_norm": 272.5259704589844, + "learning_rate": 1.0856624319419237e-05, + "loss": 41.5875, + "step": 3526 + }, + { + "epoch": 12.733182844243792, + "grad_norm": 219.70687866210938, + "learning_rate": 1.0851179673321235e-05, + "loss": 41.5546, + "step": 3527 + }, + { + "epoch": 12.736794582392777, + "grad_norm": 250.9104766845703, + "learning_rate": 1.0845735027223232e-05, + "loss": 40.0984, + "step": 3528 + }, + { + "epoch": 12.74040632054176, + "grad_norm": 260.9254150390625, + "learning_rate": 1.0840290381125227e-05, + "loss": 40.564, + "step": 3529 + }, + { + "epoch": 12.744018058690745, + "grad_norm": 275.46221923828125, + "learning_rate": 1.0834845735027224e-05, + "loss": 40.3864, + "step": 3530 + }, + { + "epoch": 12.744018058690745, + "eval_loss": 0.6099677681922913, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 3530 + }, + { + "epoch": 12.747629796839728, + "grad_norm": 200.9589385986328, + "learning_rate": 1.082940108892922e-05, + "loss": 40.5753, + "step": 3531 + }, + { + "epoch": 12.751241534988713, + "grad_norm": 228.87669372558594, + "learning_rate": 1.0823956442831215e-05, + "loss": 41.4702, + "step": 3532 + }, + { + "epoch": 12.754853273137698, + "grad_norm": 218.6998748779297, + "learning_rate": 1.0818511796733214e-05, + "loss": 41.6641, + "step": 3533 + }, + { + "epoch": 12.758465011286681, + "grad_norm": 422.519775390625, + "learning_rate": 1.0813067150635209e-05, + "loss": 41.8016, + "step": 3534 + }, + { + "epoch": 12.762076749435666, + "grad_norm": 198.31935119628906, + "learning_rate": 1.0807622504537204e-05, + "loss": 40.6053, + "step": 3535 + }, + { + "epoch": 12.76568848758465, + "grad_norm": 274.42333984375, + "learning_rate": 1.0802177858439201e-05, + "loss": 38.7974, + "step": 3536 + }, + { + "epoch": 12.769300225733634, + "grad_norm": 267.5847473144531, + "learning_rate": 1.0796733212341199e-05, + "loss": 37.157, + "step": 3537 + }, + { + "epoch": 12.772911963882619, + "grad_norm": 264.9976806640625, + "learning_rate": 1.0791288566243196e-05, + "loss": 38.1585, + "step": 3538 + }, + { + "epoch": 12.776523702031604, + "grad_norm": 216.5603790283203, + "learning_rate": 1.0785843920145191e-05, + "loss": 38.0501, + "step": 3539 + }, + { + "epoch": 12.780135440180587, + "grad_norm": 193.55081176757812, + "learning_rate": 1.0780399274047186e-05, + "loss": 38.3114, + "step": 3540 + }, + { + "epoch": 12.780135440180587, + "eval_loss": 0.6059894561767578, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 3540 + }, + { + "epoch": 12.783747178329572, + "grad_norm": 256.3584289550781, + "learning_rate": 1.0774954627949183e-05, + "loss": 38.7056, + "step": 3541 + }, + { + "epoch": 12.787358916478555, + "grad_norm": 203.17401123046875, + "learning_rate": 1.076950998185118e-05, + "loss": 39.3947, + "step": 3542 + }, + { + "epoch": 12.79097065462754, + "grad_norm": 307.99517822265625, + "learning_rate": 1.0764065335753176e-05, + "loss": 39.2121, + "step": 3543 + }, + { + "epoch": 12.794582392776524, + "grad_norm": 199.4147186279297, + "learning_rate": 1.0758620689655173e-05, + "loss": 38.4621, + "step": 3544 + }, + { + "epoch": 12.798194130925507, + "grad_norm": 251.60293579101562, + "learning_rate": 1.0753176043557168e-05, + "loss": 38.2742, + "step": 3545 + }, + { + "epoch": 12.801805869074492, + "grad_norm": 277.1817321777344, + "learning_rate": 1.0747731397459165e-05, + "loss": 38.6803, + "step": 3546 + }, + { + "epoch": 12.805417607223477, + "grad_norm": 303.2837219238281, + "learning_rate": 1.0742286751361163e-05, + "loss": 39.7843, + "step": 3547 + }, + { + "epoch": 12.80902934537246, + "grad_norm": 321.22772216796875, + "learning_rate": 1.0736842105263158e-05, + "loss": 41.3761, + "step": 3548 + }, + { + "epoch": 12.812641083521445, + "grad_norm": 238.89007568359375, + "learning_rate": 1.0731397459165155e-05, + "loss": 40.3649, + "step": 3549 + }, + { + "epoch": 12.816252821670428, + "grad_norm": 251.22291564941406, + "learning_rate": 1.072595281306715e-05, + "loss": 40.8151, + "step": 3550 + }, + { + "epoch": 12.816252821670428, + "eval_loss": 0.6065003275871277, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 3550 + }, + { + "epoch": 12.819864559819413, + "grad_norm": 218.13418579101562, + "learning_rate": 1.0720508166969147e-05, + "loss": 39.381, + "step": 3551 + }, + { + "epoch": 12.823476297968398, + "grad_norm": 250.90328979492188, + "learning_rate": 1.0715063520871145e-05, + "loss": 39.8923, + "step": 3552 + }, + { + "epoch": 12.827088036117381, + "grad_norm": 227.4825897216797, + "learning_rate": 1.070961887477314e-05, + "loss": 36.836, + "step": 3553 + }, + { + "epoch": 12.830699774266366, + "grad_norm": 253.7106475830078, + "learning_rate": 1.0704174228675135e-05, + "loss": 34.499, + "step": 3554 + }, + { + "epoch": 12.83431151241535, + "grad_norm": 280.0548400878906, + "learning_rate": 1.0698729582577132e-05, + "loss": 33.3409, + "step": 3555 + }, + { + "epoch": 12.837923250564334, + "grad_norm": 201.3768768310547, + "learning_rate": 1.069328493647913e-05, + "loss": 32.4868, + "step": 3556 + }, + { + "epoch": 12.841534988713319, + "grad_norm": 245.73446655273438, + "learning_rate": 1.0687840290381125e-05, + "loss": 32.8295, + "step": 3557 + }, + { + "epoch": 12.845146726862303, + "grad_norm": 195.0170440673828, + "learning_rate": 1.0682395644283122e-05, + "loss": 33.2009, + "step": 3558 + }, + { + "epoch": 12.848758465011286, + "grad_norm": 261.66357421875, + "learning_rate": 1.0676950998185117e-05, + "loss": 33.0627, + "step": 3559 + }, + { + "epoch": 12.852370203160271, + "grad_norm": 299.0184326171875, + "learning_rate": 1.0671506352087116e-05, + "loss": 34.184, + "step": 3560 + }, + { + "epoch": 12.852370203160271, + "eval_loss": 0.6077792048454285, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3560 + }, + { + "epoch": 12.855981941309254, + "grad_norm": 293.9249572753906, + "learning_rate": 1.0666061705989111e-05, + "loss": 34.748, + "step": 3561 + }, + { + "epoch": 12.85959367945824, + "grad_norm": 206.4182586669922, + "learning_rate": 1.0660617059891107e-05, + "loss": 33.8454, + "step": 3562 + }, + { + "epoch": 12.863205417607224, + "grad_norm": 261.4427185058594, + "learning_rate": 1.0655172413793104e-05, + "loss": 35.7317, + "step": 3563 + }, + { + "epoch": 12.866817155756207, + "grad_norm": 236.60704040527344, + "learning_rate": 1.06497277676951e-05, + "loss": 35.2389, + "step": 3564 + }, + { + "epoch": 12.870428893905192, + "grad_norm": 272.9973449707031, + "learning_rate": 1.0644283121597096e-05, + "loss": 34.8523, + "step": 3565 + }, + { + "epoch": 12.874040632054175, + "grad_norm": 228.82540893554688, + "learning_rate": 1.0638838475499093e-05, + "loss": 34.7236, + "step": 3566 + }, + { + "epoch": 12.87765237020316, + "grad_norm": 266.6078796386719, + "learning_rate": 1.0633393829401089e-05, + "loss": 36.1574, + "step": 3567 + }, + { + "epoch": 12.881264108352145, + "grad_norm": 267.52239990234375, + "learning_rate": 1.0627949183303086e-05, + "loss": 36.8466, + "step": 3568 + }, + { + "epoch": 12.884875846501128, + "grad_norm": 261.0372314453125, + "learning_rate": 1.0622504537205083e-05, + "loss": 37.2803, + "step": 3569 + }, + { + "epoch": 12.888487584650113, + "grad_norm": 220.42532348632812, + "learning_rate": 1.0617059891107078e-05, + "loss": 29.4233, + "step": 3570 + }, + { + "epoch": 12.888487584650113, + "eval_loss": 0.6131581664085388, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 3570 + }, + { + "epoch": 12.892099322799098, + "grad_norm": 187.53604125976562, + "learning_rate": 1.0611615245009075e-05, + "loss": 23.3851, + "step": 3571 + }, + { + "epoch": 12.89571106094808, + "grad_norm": 227.1913299560547, + "learning_rate": 1.060617059891107e-05, + "loss": 23.3155, + "step": 3572 + }, + { + "epoch": 12.899322799097066, + "grad_norm": 202.15939331054688, + "learning_rate": 1.0600725952813066e-05, + "loss": 24.4548, + "step": 3573 + }, + { + "epoch": 12.90293453724605, + "grad_norm": 195.67282104492188, + "learning_rate": 1.0595281306715065e-05, + "loss": 24.2037, + "step": 3574 + }, + { + "epoch": 12.906546275395034, + "grad_norm": 303.0018310546875, + "learning_rate": 1.058983666061706e-05, + "loss": 41.6489, + "step": 3575 + }, + { + "epoch": 12.910158013544018, + "grad_norm": 193.92433166503906, + "learning_rate": 1.0584392014519056e-05, + "loss": 40.3682, + "step": 3576 + }, + { + "epoch": 12.913769751693001, + "grad_norm": 305.50750732421875, + "learning_rate": 1.0578947368421053e-05, + "loss": 40.5065, + "step": 3577 + }, + { + "epoch": 12.917381489841986, + "grad_norm": 223.41732788085938, + "learning_rate": 1.0573502722323048e-05, + "loss": 41.6387, + "step": 3578 + }, + { + "epoch": 12.920993227990971, + "grad_norm": 215.65061950683594, + "learning_rate": 1.0568058076225047e-05, + "loss": 41.3623, + "step": 3579 + }, + { + "epoch": 12.924604966139954, + "grad_norm": 223.95880126953125, + "learning_rate": 1.0562613430127042e-05, + "loss": 40.7444, + "step": 3580 + }, + { + "epoch": 12.924604966139954, + "eval_loss": 0.6113386750221252, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 3580 + }, + { + "epoch": 12.928216704288939, + "grad_norm": 247.3272247314453, + "learning_rate": 1.0557168784029038e-05, + "loss": 37.8137, + "step": 3581 + }, + { + "epoch": 12.931828442437924, + "grad_norm": 277.4321594238281, + "learning_rate": 1.0551724137931035e-05, + "loss": 38.6946, + "step": 3582 + }, + { + "epoch": 12.935440180586907, + "grad_norm": 219.15576171875, + "learning_rate": 1.0546279491833032e-05, + "loss": 39.0059, + "step": 3583 + }, + { + "epoch": 12.939051918735892, + "grad_norm": 205.6105194091797, + "learning_rate": 1.0540834845735027e-05, + "loss": 39.2436, + "step": 3584 + }, + { + "epoch": 12.942663656884875, + "grad_norm": 303.84521484375, + "learning_rate": 1.0535390199637024e-05, + "loss": 39.2451, + "step": 3585 + }, + { + "epoch": 12.94627539503386, + "grad_norm": 326.2321472167969, + "learning_rate": 1.052994555353902e-05, + "loss": 38.1849, + "step": 3586 + }, + { + "epoch": 12.949887133182845, + "grad_norm": 332.7608642578125, + "learning_rate": 1.0524500907441015e-05, + "loss": 39.7121, + "step": 3587 + }, + { + "epoch": 12.953498871331828, + "grad_norm": 245.19827270507812, + "learning_rate": 1.0519056261343014e-05, + "loss": 39.6558, + "step": 3588 + }, + { + "epoch": 12.957110609480813, + "grad_norm": 227.54763793945312, + "learning_rate": 1.051361161524501e-05, + "loss": 38.6437, + "step": 3589 + }, + { + "epoch": 12.960722347629797, + "grad_norm": 273.1142272949219, + "learning_rate": 1.0508166969147006e-05, + "loss": 39.083, + "step": 3590 + }, + { + "epoch": 12.960722347629797, + "eval_loss": 0.6050187349319458, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 3590 + }, + { + "epoch": 12.96433408577878, + "grad_norm": 227.0492401123047, + "learning_rate": 1.0502722323049002e-05, + "loss": 34.0254, + "step": 3591 + }, + { + "epoch": 12.967945823927765, + "grad_norm": 201.76736450195312, + "learning_rate": 1.0497277676950999e-05, + "loss": 32.4569, + "step": 3592 + }, + { + "epoch": 12.97155756207675, + "grad_norm": 279.99237060546875, + "learning_rate": 1.0491833030852996e-05, + "loss": 33.8718, + "step": 3593 + }, + { + "epoch": 12.975169300225733, + "grad_norm": 351.647705078125, + "learning_rate": 1.0486388384754991e-05, + "loss": 34.8168, + "step": 3594 + }, + { + "epoch": 12.978781038374718, + "grad_norm": 275.7414855957031, + "learning_rate": 1.0480943738656987e-05, + "loss": 35.1731, + "step": 3595 + }, + { + "epoch": 12.982392776523701, + "grad_norm": 347.0024719238281, + "learning_rate": 1.0475499092558984e-05, + "loss": 35.7127, + "step": 3596 + }, + { + "epoch": 12.986004514672686, + "grad_norm": 304.18218994140625, + "learning_rate": 1.047005444646098e-05, + "loss": 34.7709, + "step": 3597 + }, + { + "epoch": 12.989616252821671, + "grad_norm": 306.33245849609375, + "learning_rate": 1.0464609800362976e-05, + "loss": 37.2105, + "step": 3598 + }, + { + "epoch": 12.993227990970654, + "grad_norm": 326.3535461425781, + "learning_rate": 1.0459165154264973e-05, + "loss": 33.6613, + "step": 3599 + }, + { + "epoch": 12.996839729119639, + "grad_norm": 325.7522888183594, + "learning_rate": 1.0453720508166969e-05, + "loss": 22.8985, + "step": 3600 + }, + { + "epoch": 12.996839729119639, + "eval_loss": 0.6073772311210632, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 3600 + }, + { + "epoch": 13.0, + "grad_norm": 256.7010498046875, + "learning_rate": 1.0448275862068966e-05, + "loss": 21.3776, + "step": 3601 + }, + { + "epoch": 13.003611738148985, + "grad_norm": 247.7591552734375, + "learning_rate": 1.0442831215970963e-05, + "loss": 39.0509, + "step": 3602 + }, + { + "epoch": 13.007223476297968, + "grad_norm": 389.6626281738281, + "learning_rate": 1.0437386569872958e-05, + "loss": 41.042, + "step": 3603 + }, + { + "epoch": 13.010835214446953, + "grad_norm": 271.01885986328125, + "learning_rate": 1.0431941923774955e-05, + "loss": 39.9542, + "step": 3604 + }, + { + "epoch": 13.014446952595938, + "grad_norm": 263.2490539550781, + "learning_rate": 1.042649727767695e-05, + "loss": 39.8852, + "step": 3605 + }, + { + "epoch": 13.01805869074492, + "grad_norm": 255.46878051757812, + "learning_rate": 1.0421052631578948e-05, + "loss": 39.3902, + "step": 3606 + }, + { + "epoch": 13.021670428893906, + "grad_norm": 206.02244567871094, + "learning_rate": 1.0415607985480945e-05, + "loss": 40.1731, + "step": 3607 + }, + { + "epoch": 13.025282167042889, + "grad_norm": 194.83055114746094, + "learning_rate": 1.041016333938294e-05, + "loss": 39.17, + "step": 3608 + }, + { + "epoch": 13.028893905191874, + "grad_norm": 230.1270294189453, + "learning_rate": 1.0404718693284936e-05, + "loss": 40.3363, + "step": 3609 + }, + { + "epoch": 13.032505643340858, + "grad_norm": 206.0470733642578, + "learning_rate": 1.0399274047186933e-05, + "loss": 40.7774, + "step": 3610 + }, + { + "epoch": 13.032505643340858, + "eval_loss": 0.6078981161117554, + "eval_runtime": 3.1697, + "eval_samples_per_second": 56.472, + "eval_steps_per_second": 56.472, + "step": 3610 + }, + { + "epoch": 13.036117381489841, + "grad_norm": 210.79327392578125, + "learning_rate": 1.039382940108893e-05, + "loss": 40.725, + "step": 3611 + }, + { + "epoch": 13.039729119638826, + "grad_norm": 200.4281768798828, + "learning_rate": 1.0388384754990927e-05, + "loss": 38.8736, + "step": 3612 + }, + { + "epoch": 13.043340857787811, + "grad_norm": 183.33575439453125, + "learning_rate": 1.0382940108892922e-05, + "loss": 37.5542, + "step": 3613 + }, + { + "epoch": 13.046952595936794, + "grad_norm": 195.2568817138672, + "learning_rate": 1.0377495462794918e-05, + "loss": 36.5576, + "step": 3614 + }, + { + "epoch": 13.050564334085779, + "grad_norm": 223.9565887451172, + "learning_rate": 1.0372050816696916e-05, + "loss": 36.9015, + "step": 3615 + }, + { + "epoch": 13.054176072234762, + "grad_norm": 264.0516052246094, + "learning_rate": 1.0366606170598912e-05, + "loss": 38.8146, + "step": 3616 + }, + { + "epoch": 13.057787810383747, + "grad_norm": 247.3844757080078, + "learning_rate": 1.0361161524500907e-05, + "loss": 37.0338, + "step": 3617 + }, + { + "epoch": 13.061399548532732, + "grad_norm": 243.3253173828125, + "learning_rate": 1.0355716878402904e-05, + "loss": 37.3565, + "step": 3618 + }, + { + "epoch": 13.065011286681715, + "grad_norm": 213.89939880371094, + "learning_rate": 1.03502722323049e-05, + "loss": 38.367, + "step": 3619 + }, + { + "epoch": 13.0686230248307, + "grad_norm": 254.04953002929688, + "learning_rate": 1.0344827586206898e-05, + "loss": 38.3101, + "step": 3620 + }, + { + "epoch": 13.0686230248307, + "eval_loss": 0.6108394861221313, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 3620 + }, + { + "epoch": 13.072234762979685, + "grad_norm": 235.3623046875, + "learning_rate": 1.0339382940108894e-05, + "loss": 38.3113, + "step": 3621 + }, + { + "epoch": 13.075846501128668, + "grad_norm": 259.0147399902344, + "learning_rate": 1.0333938294010889e-05, + "loss": 36.9916, + "step": 3622 + }, + { + "epoch": 13.079458239277653, + "grad_norm": 257.96575927734375, + "learning_rate": 1.0328493647912886e-05, + "loss": 36.5944, + "step": 3623 + }, + { + "epoch": 13.083069977426636, + "grad_norm": 228.49131774902344, + "learning_rate": 1.0323049001814882e-05, + "loss": 39.7592, + "step": 3624 + }, + { + "epoch": 13.08668171557562, + "grad_norm": 278.5231018066406, + "learning_rate": 1.0317604355716879e-05, + "loss": 38.7785, + "step": 3625 + }, + { + "epoch": 13.090293453724605, + "grad_norm": 218.6136932373047, + "learning_rate": 1.0312159709618876e-05, + "loss": 39.6878, + "step": 3626 + }, + { + "epoch": 13.093905191873588, + "grad_norm": 231.03012084960938, + "learning_rate": 1.0306715063520871e-05, + "loss": 40.5433, + "step": 3627 + }, + { + "epoch": 13.097516930022573, + "grad_norm": 254.7096405029297, + "learning_rate": 1.0301270417422866e-05, + "loss": 39.1311, + "step": 3628 + }, + { + "epoch": 13.101128668171558, + "grad_norm": 303.50274658203125, + "learning_rate": 1.0295825771324865e-05, + "loss": 38.6237, + "step": 3629 + }, + { + "epoch": 13.104740406320541, + "grad_norm": 217.4394073486328, + "learning_rate": 1.029038112522686e-05, + "loss": 36.5534, + "step": 3630 + }, + { + "epoch": 13.104740406320541, + "eval_loss": 0.6075544357299805, + "eval_runtime": 3.1475, + "eval_samples_per_second": 56.87, + "eval_steps_per_second": 56.87, + "step": 3630 + }, + { + "epoch": 13.108352144469526, + "grad_norm": 249.18490600585938, + "learning_rate": 1.0284936479128858e-05, + "loss": 34.2153, + "step": 3631 + }, + { + "epoch": 13.111963882618511, + "grad_norm": 261.9061584472656, + "learning_rate": 1.0279491833030853e-05, + "loss": 33.7793, + "step": 3632 + }, + { + "epoch": 13.115575620767494, + "grad_norm": 205.93113708496094, + "learning_rate": 1.0274047186932848e-05, + "loss": 31.2934, + "step": 3633 + }, + { + "epoch": 13.119187358916479, + "grad_norm": 203.82980346679688, + "learning_rate": 1.0268602540834847e-05, + "loss": 31.9074, + "step": 3634 + }, + { + "epoch": 13.122799097065462, + "grad_norm": 309.0658874511719, + "learning_rate": 1.0263157894736843e-05, + "loss": 32.6883, + "step": 3635 + }, + { + "epoch": 13.126410835214447, + "grad_norm": 239.59312438964844, + "learning_rate": 1.0257713248638838e-05, + "loss": 34.1261, + "step": 3636 + }, + { + "epoch": 13.130022573363432, + "grad_norm": 360.4351501464844, + "learning_rate": 1.0252268602540835e-05, + "loss": 34.7656, + "step": 3637 + }, + { + "epoch": 13.133634311512415, + "grad_norm": 319.87451171875, + "learning_rate": 1.024682395644283e-05, + "loss": 34.6533, + "step": 3638 + }, + { + "epoch": 13.1372460496614, + "grad_norm": 352.31707763671875, + "learning_rate": 1.0241379310344828e-05, + "loss": 33.9159, + "step": 3639 + }, + { + "epoch": 13.140857787810384, + "grad_norm": 288.85418701171875, + "learning_rate": 1.0235934664246825e-05, + "loss": 34.6115, + "step": 3640 + }, + { + "epoch": 13.140857787810384, + "eval_loss": 0.6106187105178833, + "eval_runtime": 3.1535, + "eval_samples_per_second": 56.763, + "eval_steps_per_second": 56.763, + "step": 3640 + }, + { + "epoch": 13.144469525959368, + "grad_norm": 263.8638000488281, + "learning_rate": 1.023049001814882e-05, + "loss": 34.3008, + "step": 3641 + }, + { + "epoch": 13.148081264108352, + "grad_norm": 308.10650634765625, + "learning_rate": 1.0225045372050817e-05, + "loss": 35.9397, + "step": 3642 + }, + { + "epoch": 13.151693002257336, + "grad_norm": 208.60519409179688, + "learning_rate": 1.0219600725952814e-05, + "loss": 34.2573, + "step": 3643 + }, + { + "epoch": 13.15530474040632, + "grad_norm": 251.36766052246094, + "learning_rate": 1.021415607985481e-05, + "loss": 35.853, + "step": 3644 + }, + { + "epoch": 13.158916478555305, + "grad_norm": 264.94818115234375, + "learning_rate": 1.0208711433756807e-05, + "loss": 35.7057, + "step": 3645 + }, + { + "epoch": 13.162528216704288, + "grad_norm": 313.0333251953125, + "learning_rate": 1.0203266787658802e-05, + "loss": 34.611, + "step": 3646 + }, + { + "epoch": 13.166139954853273, + "grad_norm": 254.9687042236328, + "learning_rate": 1.0197822141560797e-05, + "loss": 31.1751, + "step": 3647 + }, + { + "epoch": 13.169751693002258, + "grad_norm": 219.7308349609375, + "learning_rate": 1.0192377495462796e-05, + "loss": 22.8425, + "step": 3648 + }, + { + "epoch": 13.173363431151241, + "grad_norm": 305.76416015625, + "learning_rate": 1.0186932849364792e-05, + "loss": 22.5266, + "step": 3649 + }, + { + "epoch": 13.176975169300226, + "grad_norm": 301.26239013671875, + "learning_rate": 1.0181488203266787e-05, + "loss": 23.861, + "step": 3650 + }, + { + "epoch": 13.176975169300226, + "eval_loss": 0.6107029914855957, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 3650 + }, + { + "epoch": 13.18058690744921, + "grad_norm": 235.15576171875, + "learning_rate": 1.0176043557168784e-05, + "loss": 24.495, + "step": 3651 + }, + { + "epoch": 13.184198645598194, + "grad_norm": 268.524658203125, + "learning_rate": 1.0170598911070781e-05, + "loss": 40.3819, + "step": 3652 + }, + { + "epoch": 13.187810383747179, + "grad_norm": 257.869140625, + "learning_rate": 1.0165154264972778e-05, + "loss": 42.2715, + "step": 3653 + }, + { + "epoch": 13.191422121896162, + "grad_norm": 191.8995361328125, + "learning_rate": 1.0159709618874774e-05, + "loss": 41.2991, + "step": 3654 + }, + { + "epoch": 13.195033860045147, + "grad_norm": 242.85342407226562, + "learning_rate": 1.0154264972776769e-05, + "loss": 39.6007, + "step": 3655 + }, + { + "epoch": 13.198645598194132, + "grad_norm": 279.1092529296875, + "learning_rate": 1.0148820326678766e-05, + "loss": 39.8502, + "step": 3656 + }, + { + "epoch": 13.202257336343115, + "grad_norm": 233.94708251953125, + "learning_rate": 1.0143375680580763e-05, + "loss": 39.6407, + "step": 3657 + }, + { + "epoch": 13.2058690744921, + "grad_norm": 227.53001403808594, + "learning_rate": 1.0137931034482758e-05, + "loss": 40.3618, + "step": 3658 + }, + { + "epoch": 13.209480812641084, + "grad_norm": 216.17654418945312, + "learning_rate": 1.0132486388384756e-05, + "loss": 41.3187, + "step": 3659 + }, + { + "epoch": 13.213092550790067, + "grad_norm": 199.51072692871094, + "learning_rate": 1.0127041742286751e-05, + "loss": 41.7474, + "step": 3660 + }, + { + "epoch": 13.213092550790067, + "eval_loss": 0.6099065542221069, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3660 + }, + { + "epoch": 13.216704288939052, + "grad_norm": 212.3302001953125, + "learning_rate": 1.0121597096188748e-05, + "loss": 40.8565, + "step": 3661 + }, + { + "epoch": 13.220316027088035, + "grad_norm": 185.42857360839844, + "learning_rate": 1.0116152450090745e-05, + "loss": 41.5302, + "step": 3662 + }, + { + "epoch": 13.22392776523702, + "grad_norm": 241.05487060546875, + "learning_rate": 1.011070780399274e-05, + "loss": 38.6842, + "step": 3663 + }, + { + "epoch": 13.227539503386005, + "grad_norm": 314.1755065917969, + "learning_rate": 1.0105263157894738e-05, + "loss": 37.8021, + "step": 3664 + }, + { + "epoch": 13.231151241534988, + "grad_norm": 262.6571960449219, + "learning_rate": 1.0099818511796733e-05, + "loss": 36.3265, + "step": 3665 + }, + { + "epoch": 13.234762979683973, + "grad_norm": 259.24029541015625, + "learning_rate": 1.009437386569873e-05, + "loss": 38.4521, + "step": 3666 + }, + { + "epoch": 13.238374717832958, + "grad_norm": 223.5182342529297, + "learning_rate": 1.0088929219600727e-05, + "loss": 37.3267, + "step": 3667 + }, + { + "epoch": 13.241986455981941, + "grad_norm": 181.72926330566406, + "learning_rate": 1.0083484573502722e-05, + "loss": 38.0142, + "step": 3668 + }, + { + "epoch": 13.245598194130926, + "grad_norm": 204.99813842773438, + "learning_rate": 1.0078039927404718e-05, + "loss": 37.3513, + "step": 3669 + }, + { + "epoch": 13.249209932279909, + "grad_norm": 184.05482482910156, + "learning_rate": 1.0072595281306715e-05, + "loss": 37.9737, + "step": 3670 + }, + { + "epoch": 13.249209932279909, + "eval_loss": 0.6081296801567078, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 3670 + }, + { + "epoch": 13.252821670428894, + "grad_norm": 261.076416015625, + "learning_rate": 1.0067150635208712e-05, + "loss": 38.1087, + "step": 3671 + }, + { + "epoch": 13.256433408577879, + "grad_norm": 218.79515075683594, + "learning_rate": 1.0061705989110709e-05, + "loss": 37.215, + "step": 3672 + }, + { + "epoch": 13.260045146726862, + "grad_norm": 240.93222045898438, + "learning_rate": 1.0056261343012704e-05, + "loss": 37.4461, + "step": 3673 + }, + { + "epoch": 13.263656884875846, + "grad_norm": 241.46072387695312, + "learning_rate": 1.00508166969147e-05, + "loss": 39.4396, + "step": 3674 + }, + { + "epoch": 13.267268623024831, + "grad_norm": 217.85369873046875, + "learning_rate": 1.0045372050816699e-05, + "loss": 38.5512, + "step": 3675 + }, + { + "epoch": 13.270880361173814, + "grad_norm": 254.53549194335938, + "learning_rate": 1.0039927404718694e-05, + "loss": 39.4436, + "step": 3676 + }, + { + "epoch": 13.2744920993228, + "grad_norm": 330.2030029296875, + "learning_rate": 1.003448275862069e-05, + "loss": 39.6341, + "step": 3677 + }, + { + "epoch": 13.278103837471784, + "grad_norm": 267.6778869628906, + "learning_rate": 1.0029038112522686e-05, + "loss": 38.5305, + "step": 3678 + }, + { + "epoch": 13.281715575620767, + "grad_norm": 251.23703002929688, + "learning_rate": 1.0023593466424682e-05, + "loss": 39.712, + "step": 3679 + }, + { + "epoch": 13.285327313769752, + "grad_norm": 258.8126525878906, + "learning_rate": 1.0018148820326679e-05, + "loss": 37.982, + "step": 3680 + }, + { + "epoch": 13.285327313769752, + "eval_loss": 0.6092600226402283, + "eval_runtime": 3.1494, + "eval_samples_per_second": 56.837, + "eval_steps_per_second": 56.837, + "step": 3680 + }, + { + "epoch": 13.288939051918735, + "grad_norm": 270.01690673828125, + "learning_rate": 1.0012704174228676e-05, + "loss": 35.8938, + "step": 3681 + }, + { + "epoch": 13.29255079006772, + "grad_norm": 271.138671875, + "learning_rate": 1.0007259528130671e-05, + "loss": 33.2221, + "step": 3682 + }, + { + "epoch": 13.296162528216705, + "grad_norm": 239.4976806640625, + "learning_rate": 1.0001814882032668e-05, + "loss": 32.6252, + "step": 3683 + }, + { + "epoch": 13.299774266365688, + "grad_norm": 203.7470245361328, + "learning_rate": 9.996370235934664e-06, + "loss": 32.3694, + "step": 3684 + }, + { + "epoch": 13.303386004514673, + "grad_norm": 255.28419494628906, + "learning_rate": 9.990925589836661e-06, + "loss": 32.7386, + "step": 3685 + }, + { + "epoch": 13.306997742663658, + "grad_norm": 267.82489013671875, + "learning_rate": 9.985480943738658e-06, + "loss": 33.7657, + "step": 3686 + }, + { + "epoch": 13.31060948081264, + "grad_norm": 224.82432556152344, + "learning_rate": 9.980036297640653e-06, + "loss": 34.085, + "step": 3687 + }, + { + "epoch": 13.314221218961626, + "grad_norm": 249.92684936523438, + "learning_rate": 9.974591651542649e-06, + "loss": 33.9186, + "step": 3688 + }, + { + "epoch": 13.317832957110609, + "grad_norm": 249.29620361328125, + "learning_rate": 9.969147005444648e-06, + "loss": 35.0909, + "step": 3689 + }, + { + "epoch": 13.321444695259594, + "grad_norm": 276.4640808105469, + "learning_rate": 9.963702359346643e-06, + "loss": 35.6823, + "step": 3690 + }, + { + "epoch": 13.321444695259594, + "eval_loss": 0.6132593154907227, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3690 + }, + { + "epoch": 13.325056433408578, + "grad_norm": 245.46163940429688, + "learning_rate": 9.958257713248638e-06, + "loss": 35.7071, + "step": 3691 + }, + { + "epoch": 13.328668171557561, + "grad_norm": 311.008544921875, + "learning_rate": 9.952813067150635e-06, + "loss": 33.6089, + "step": 3692 + }, + { + "epoch": 13.332279909706546, + "grad_norm": 283.2784118652344, + "learning_rate": 9.94736842105263e-06, + "loss": 34.9939, + "step": 3693 + }, + { + "epoch": 13.335891647855531, + "grad_norm": 293.2317199707031, + "learning_rate": 9.94192377495463e-06, + "loss": 37.1149, + "step": 3694 + }, + { + "epoch": 13.339503386004514, + "grad_norm": 263.33111572265625, + "learning_rate": 9.936479128856625e-06, + "loss": 36.5911, + "step": 3695 + }, + { + "epoch": 13.343115124153499, + "grad_norm": 285.1488952636719, + "learning_rate": 9.93103448275862e-06, + "loss": 35.9336, + "step": 3696 + }, + { + "epoch": 13.346726862302482, + "grad_norm": 246.30616760253906, + "learning_rate": 9.925589836660617e-06, + "loss": 26.1555, + "step": 3697 + }, + { + "epoch": 13.350338600451467, + "grad_norm": 185.4857177734375, + "learning_rate": 9.920145190562614e-06, + "loss": 21.9519, + "step": 3698 + }, + { + "epoch": 13.353950338600452, + "grad_norm": 269.6291809082031, + "learning_rate": 9.91470054446461e-06, + "loss": 22.5592, + "step": 3699 + }, + { + "epoch": 13.357562076749435, + "grad_norm": 214.7660675048828, + "learning_rate": 9.909255898366607e-06, + "loss": 23.2505, + "step": 3700 + }, + { + "epoch": 13.357562076749435, + "eval_loss": 0.6123418211936951, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 3700 + }, + { + "epoch": 13.36117381489842, + "grad_norm": 227.8025360107422, + "learning_rate": 9.903811252268602e-06, + "loss": 23.9731, + "step": 3701 + }, + { + "epoch": 13.364785553047405, + "grad_norm": 261.7846374511719, + "learning_rate": 9.898366606170598e-06, + "loss": 40.3869, + "step": 3702 + }, + { + "epoch": 13.368397291196388, + "grad_norm": 305.4109802246094, + "learning_rate": 9.892921960072596e-06, + "loss": 41.9626, + "step": 3703 + }, + { + "epoch": 13.372009029345373, + "grad_norm": 272.86236572265625, + "learning_rate": 9.887477313974592e-06, + "loss": 39.9819, + "step": 3704 + }, + { + "epoch": 13.375620767494357, + "grad_norm": 371.4781188964844, + "learning_rate": 9.882032667876589e-06, + "loss": 40.8074, + "step": 3705 + }, + { + "epoch": 13.37923250564334, + "grad_norm": 278.7463684082031, + "learning_rate": 9.876588021778584e-06, + "loss": 40.6721, + "step": 3706 + }, + { + "epoch": 13.382844243792325, + "grad_norm": 270.41619873046875, + "learning_rate": 9.87114337568058e-06, + "loss": 40.1604, + "step": 3707 + }, + { + "epoch": 13.386455981941308, + "grad_norm": 204.42018127441406, + "learning_rate": 9.865698729582578e-06, + "loss": 41.4666, + "step": 3708 + }, + { + "epoch": 13.390067720090293, + "grad_norm": 197.43289184570312, + "learning_rate": 9.860254083484574e-06, + "loss": 40.953, + "step": 3709 + }, + { + "epoch": 13.393679458239278, + "grad_norm": 203.92056274414062, + "learning_rate": 9.85480943738657e-06, + "loss": 40.6416, + "step": 3710 + }, + { + "epoch": 13.393679458239278, + "eval_loss": 0.608938992023468, + "eval_runtime": 3.1479, + "eval_samples_per_second": 56.863, + "eval_steps_per_second": 56.863, + "step": 3710 + }, + { + "epoch": 13.397291196388261, + "grad_norm": 353.2951354980469, + "learning_rate": 9.849364791288566e-06, + "loss": 39.7, + "step": 3711 + }, + { + "epoch": 13.400902934537246, + "grad_norm": 222.94410705566406, + "learning_rate": 9.843920145190563e-06, + "loss": 40.4703, + "step": 3712 + }, + { + "epoch": 13.404514672686231, + "grad_norm": 301.0710754394531, + "learning_rate": 9.83847549909256e-06, + "loss": 37.0453, + "step": 3713 + }, + { + "epoch": 13.408126410835214, + "grad_norm": 251.70263671875, + "learning_rate": 9.833030852994556e-06, + "loss": 37.5346, + "step": 3714 + }, + { + "epoch": 13.411738148984199, + "grad_norm": 201.29335021972656, + "learning_rate": 9.827586206896551e-06, + "loss": 39.0706, + "step": 3715 + }, + { + "epoch": 13.415349887133182, + "grad_norm": 233.82212829589844, + "learning_rate": 9.822141560798548e-06, + "loss": 38.4527, + "step": 3716 + }, + { + "epoch": 13.418961625282167, + "grad_norm": 245.0128936767578, + "learning_rate": 9.816696914700545e-06, + "loss": 37.82, + "step": 3717 + }, + { + "epoch": 13.422573363431152, + "grad_norm": 325.1784973144531, + "learning_rate": 9.81125226860254e-06, + "loss": 38.8858, + "step": 3718 + }, + { + "epoch": 13.426185101580135, + "grad_norm": 196.15032958984375, + "learning_rate": 9.805807622504538e-06, + "loss": 37.1919, + "step": 3719 + }, + { + "epoch": 13.42979683972912, + "grad_norm": 254.73980712890625, + "learning_rate": 9.800362976406533e-06, + "loss": 39.1644, + "step": 3720 + }, + { + "epoch": 13.42979683972912, + "eval_loss": 0.6100116968154907, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 3720 + }, + { + "epoch": 13.433408577878104, + "grad_norm": 253.11489868164062, + "learning_rate": 9.79491833030853e-06, + "loss": 39.8542, + "step": 3721 + }, + { + "epoch": 13.437020316027088, + "grad_norm": 267.8416748046875, + "learning_rate": 9.789473684210527e-06, + "loss": 39.8469, + "step": 3722 + }, + { + "epoch": 13.440632054176072, + "grad_norm": 267.62835693359375, + "learning_rate": 9.784029038112523e-06, + "loss": 37.4556, + "step": 3723 + }, + { + "epoch": 13.444243792325057, + "grad_norm": 346.6018371582031, + "learning_rate": 9.77858439201452e-06, + "loss": 39.7817, + "step": 3724 + }, + { + "epoch": 13.44785553047404, + "grad_norm": 241.95008850097656, + "learning_rate": 9.773139745916515e-06, + "loss": 39.1631, + "step": 3725 + }, + { + "epoch": 13.451467268623025, + "grad_norm": 244.9163055419922, + "learning_rate": 9.767695099818512e-06, + "loss": 38.6152, + "step": 3726 + }, + { + "epoch": 13.455079006772008, + "grad_norm": 243.60633850097656, + "learning_rate": 9.76225045372051e-06, + "loss": 39.5388, + "step": 3727 + }, + { + "epoch": 13.458690744920993, + "grad_norm": 230.57276916503906, + "learning_rate": 9.756805807622505e-06, + "loss": 40.3007, + "step": 3728 + }, + { + "epoch": 13.462302483069978, + "grad_norm": 228.76754760742188, + "learning_rate": 9.7513611615245e-06, + "loss": 37.7111, + "step": 3729 + }, + { + "epoch": 13.465914221218961, + "grad_norm": 292.7367248535156, + "learning_rate": 9.745916515426497e-06, + "loss": 38.4114, + "step": 3730 + }, + { + "epoch": 13.465914221218961, + "eval_loss": 0.6064842939376831, + "eval_runtime": 3.1485, + "eval_samples_per_second": 56.852, + "eval_steps_per_second": 56.852, + "step": 3730 + }, + { + "epoch": 13.469525959367946, + "grad_norm": 226.9254150390625, + "learning_rate": 9.740471869328494e-06, + "loss": 34.015, + "step": 3731 + }, + { + "epoch": 13.47313769751693, + "grad_norm": 250.38137817382812, + "learning_rate": 9.73502722323049e-06, + "loss": 34.2911, + "step": 3732 + }, + { + "epoch": 13.476749435665914, + "grad_norm": 230.447265625, + "learning_rate": 9.729582577132487e-06, + "loss": 31.8708, + "step": 3733 + }, + { + "epoch": 13.480361173814899, + "grad_norm": 241.05787658691406, + "learning_rate": 9.724137931034482e-06, + "loss": 34.5685, + "step": 3734 + }, + { + "epoch": 13.483972911963882, + "grad_norm": 248.07254028320312, + "learning_rate": 9.718693284936481e-06, + "loss": 32.6084, + "step": 3735 + }, + { + "epoch": 13.487584650112867, + "grad_norm": 241.22862243652344, + "learning_rate": 9.713248638838476e-06, + "loss": 32.787, + "step": 3736 + }, + { + "epoch": 13.491196388261852, + "grad_norm": 295.4871520996094, + "learning_rate": 9.707803992740472e-06, + "loss": 33.9786, + "step": 3737 + }, + { + "epoch": 13.494808126410835, + "grad_norm": 285.3634948730469, + "learning_rate": 9.702359346642469e-06, + "loss": 33.9872, + "step": 3738 + }, + { + "epoch": 13.49841986455982, + "grad_norm": 302.39947509765625, + "learning_rate": 9.696914700544464e-06, + "loss": 33.9854, + "step": 3739 + }, + { + "epoch": 13.502031602708804, + "grad_norm": 310.0465087890625, + "learning_rate": 9.691470054446461e-06, + "loss": 34.1859, + "step": 3740 + }, + { + "epoch": 13.502031602708804, + "eval_loss": 0.6067100167274475, + "eval_runtime": 3.1393, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3740 + }, + { + "epoch": 13.505643340857787, + "grad_norm": 319.9311828613281, + "learning_rate": 9.686025408348458e-06, + "loss": 34.5264, + "step": 3741 + }, + { + "epoch": 13.509255079006772, + "grad_norm": 291.75738525390625, + "learning_rate": 9.680580762250454e-06, + "loss": 35.8348, + "step": 3742 + }, + { + "epoch": 13.512866817155757, + "grad_norm": 291.5312805175781, + "learning_rate": 9.675136116152449e-06, + "loss": 33.8803, + "step": 3743 + }, + { + "epoch": 13.51647855530474, + "grad_norm": 228.00588989257812, + "learning_rate": 9.669691470054448e-06, + "loss": 36.1919, + "step": 3744 + }, + { + "epoch": 13.520090293453725, + "grad_norm": 236.5559539794922, + "learning_rate": 9.664246823956443e-06, + "loss": 35.8432, + "step": 3745 + }, + { + "epoch": 13.523702031602708, + "grad_norm": 287.7408752441406, + "learning_rate": 9.65880217785844e-06, + "loss": 37.069, + "step": 3746 + }, + { + "epoch": 13.527313769751693, + "grad_norm": 272.73870849609375, + "learning_rate": 9.653357531760436e-06, + "loss": 29.1896, + "step": 3747 + }, + { + "epoch": 13.530925507900678, + "grad_norm": 256.5550842285156, + "learning_rate": 9.647912885662431e-06, + "loss": 23.0953, + "step": 3748 + }, + { + "epoch": 13.534537246049661, + "grad_norm": 230.98487854003906, + "learning_rate": 9.64246823956443e-06, + "loss": 21.9902, + "step": 3749 + }, + { + "epoch": 13.538148984198646, + "grad_norm": 247.1185760498047, + "learning_rate": 9.637023593466425e-06, + "loss": 23.7439, + "step": 3750 + }, + { + "epoch": 13.538148984198646, + "eval_loss": 0.6106311082839966, + "eval_runtime": 3.1356, + "eval_samples_per_second": 57.086, + "eval_steps_per_second": 57.086, + "step": 3750 + }, + { + "epoch": 13.54176072234763, + "grad_norm": 193.83152770996094, + "learning_rate": 9.63157894736842e-06, + "loss": 24.2292, + "step": 3751 + }, + { + "epoch": 13.545372460496614, + "grad_norm": 322.80487060546875, + "learning_rate": 9.626134301270418e-06, + "loss": 40.9778, + "step": 3752 + }, + { + "epoch": 13.548984198645599, + "grad_norm": 345.0560302734375, + "learning_rate": 9.620689655172413e-06, + "loss": 42.3601, + "step": 3753 + }, + { + "epoch": 13.552595936794582, + "grad_norm": 240.3759002685547, + "learning_rate": 9.61524500907441e-06, + "loss": 41.092, + "step": 3754 + }, + { + "epoch": 13.556207674943566, + "grad_norm": 219.0955352783203, + "learning_rate": 9.609800362976407e-06, + "loss": 40.3108, + "step": 3755 + }, + { + "epoch": 13.559819413092551, + "grad_norm": 255.6158447265625, + "learning_rate": 9.604355716878403e-06, + "loss": 39.8885, + "step": 3756 + }, + { + "epoch": 13.563431151241534, + "grad_norm": 264.55010986328125, + "learning_rate": 9.5989110707804e-06, + "loss": 40.8838, + "step": 3757 + }, + { + "epoch": 13.56704288939052, + "grad_norm": 313.0918273925781, + "learning_rate": 9.593466424682397e-06, + "loss": 40.6634, + "step": 3758 + }, + { + "epoch": 13.570654627539504, + "grad_norm": 304.87396240234375, + "learning_rate": 9.588021778584392e-06, + "loss": 41.8734, + "step": 3759 + }, + { + "epoch": 13.574266365688487, + "grad_norm": 239.76063537597656, + "learning_rate": 9.58257713248639e-06, + "loss": 40.6281, + "step": 3760 + }, + { + "epoch": 13.574266365688487, + "eval_loss": 0.6124129891395569, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 3760 + }, + { + "epoch": 13.577878103837472, + "grad_norm": 201.89422607421875, + "learning_rate": 9.577132486388385e-06, + "loss": 39.6948, + "step": 3761 + }, + { + "epoch": 13.581489841986457, + "grad_norm": 232.8797607421875, + "learning_rate": 9.57168784029038e-06, + "loss": 39.6927, + "step": 3762 + }, + { + "epoch": 13.58510158013544, + "grad_norm": 250.30355834960938, + "learning_rate": 9.566243194192379e-06, + "loss": 37.6926, + "step": 3763 + }, + { + "epoch": 13.588713318284425, + "grad_norm": 256.23626708984375, + "learning_rate": 9.560798548094374e-06, + "loss": 38.248, + "step": 3764 + }, + { + "epoch": 13.592325056433408, + "grad_norm": 234.1791534423828, + "learning_rate": 9.555353901996371e-06, + "loss": 36.8178, + "step": 3765 + }, + { + "epoch": 13.595936794582393, + "grad_norm": 243.87615966796875, + "learning_rate": 9.549909255898367e-06, + "loss": 37.0802, + "step": 3766 + }, + { + "epoch": 13.599548532731378, + "grad_norm": 220.98150634765625, + "learning_rate": 9.544464609800362e-06, + "loss": 37.1251, + "step": 3767 + }, + { + "epoch": 13.60316027088036, + "grad_norm": 235.8653564453125, + "learning_rate": 9.53901996370236e-06, + "loss": 38.2965, + "step": 3768 + }, + { + "epoch": 13.606772009029346, + "grad_norm": 237.66712951660156, + "learning_rate": 9.533575317604356e-06, + "loss": 38.0266, + "step": 3769 + }, + { + "epoch": 13.610383747178329, + "grad_norm": 229.4922637939453, + "learning_rate": 9.528130671506351e-06, + "loss": 38.4199, + "step": 3770 + }, + { + "epoch": 13.610383747178329, + "eval_loss": 0.6078812479972839, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 3770 + }, + { + "epoch": 13.613995485327314, + "grad_norm": 250.82533264160156, + "learning_rate": 9.522686025408349e-06, + "loss": 39.713, + "step": 3771 + }, + { + "epoch": 13.617607223476298, + "grad_norm": 218.97511291503906, + "learning_rate": 9.517241379310346e-06, + "loss": 37.6396, + "step": 3772 + }, + { + "epoch": 13.621218961625281, + "grad_norm": 240.13096618652344, + "learning_rate": 9.511796733212341e-06, + "loss": 39.2808, + "step": 3773 + }, + { + "epoch": 13.624830699774266, + "grad_norm": 214.77957153320312, + "learning_rate": 9.506352087114338e-06, + "loss": 39.1584, + "step": 3774 + }, + { + "epoch": 13.628442437923251, + "grad_norm": 273.2488708496094, + "learning_rate": 9.500907441016333e-06, + "loss": 39.6725, + "step": 3775 + }, + { + "epoch": 13.632054176072234, + "grad_norm": 240.46669006347656, + "learning_rate": 9.49546279491833e-06, + "loss": 40.155, + "step": 3776 + }, + { + "epoch": 13.635665914221219, + "grad_norm": 304.46533203125, + "learning_rate": 9.490018148820328e-06, + "loss": 39.5831, + "step": 3777 + }, + { + "epoch": 13.639277652370204, + "grad_norm": 282.9252624511719, + "learning_rate": 9.484573502722323e-06, + "loss": 40.8392, + "step": 3778 + }, + { + "epoch": 13.642889390519187, + "grad_norm": 229.2595977783203, + "learning_rate": 9.47912885662432e-06, + "loss": 38.4015, + "step": 3779 + }, + { + "epoch": 13.646501128668172, + "grad_norm": 300.0253601074219, + "learning_rate": 9.473684210526315e-06, + "loss": 35.0578, + "step": 3780 + }, + { + "epoch": 13.646501128668172, + "eval_loss": 0.6059401631355286, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.967, + "eval_steps_per_second": 56.967, + "step": 3780 + }, + { + "epoch": 13.650112866817155, + "grad_norm": 266.379638671875, + "learning_rate": 9.468239564428313e-06, + "loss": 33.0308, + "step": 3781 + }, + { + "epoch": 13.65372460496614, + "grad_norm": 248.8190460205078, + "learning_rate": 9.46279491833031e-06, + "loss": 31.7632, + "step": 3782 + }, + { + "epoch": 13.657336343115125, + "grad_norm": 224.4126739501953, + "learning_rate": 9.457350272232305e-06, + "loss": 32.8875, + "step": 3783 + }, + { + "epoch": 13.660948081264108, + "grad_norm": 259.84466552734375, + "learning_rate": 9.4519056261343e-06, + "loss": 32.3248, + "step": 3784 + }, + { + "epoch": 13.664559819413093, + "grad_norm": 233.59483337402344, + "learning_rate": 9.446460980036297e-06, + "loss": 32.5855, + "step": 3785 + }, + { + "epoch": 13.668171557562077, + "grad_norm": 283.1840515136719, + "learning_rate": 9.441016333938295e-06, + "loss": 33.8277, + "step": 3786 + }, + { + "epoch": 13.67178329571106, + "grad_norm": 269.51171875, + "learning_rate": 9.435571687840292e-06, + "loss": 33.8348, + "step": 3787 + }, + { + "epoch": 13.675395033860045, + "grad_norm": 284.6701354980469, + "learning_rate": 9.430127041742287e-06, + "loss": 34.2571, + "step": 3788 + }, + { + "epoch": 13.679006772009028, + "grad_norm": 308.96221923828125, + "learning_rate": 9.424682395644282e-06, + "loss": 34.2313, + "step": 3789 + }, + { + "epoch": 13.682618510158013, + "grad_norm": 229.36366271972656, + "learning_rate": 9.41923774954628e-06, + "loss": 34.6341, + "step": 3790 + }, + { + "epoch": 13.682618510158013, + "eval_loss": 0.606715202331543, + "eval_runtime": 3.1366, + "eval_samples_per_second": 57.068, + "eval_steps_per_second": 57.068, + "step": 3790 + }, + { + "epoch": 13.686230248306998, + "grad_norm": 335.4346008300781, + "learning_rate": 9.413793103448277e-06, + "loss": 35.2222, + "step": 3791 + }, + { + "epoch": 13.689841986455981, + "grad_norm": 259.72222900390625, + "learning_rate": 9.408348457350272e-06, + "loss": 34.7416, + "step": 3792 + }, + { + "epoch": 13.693453724604966, + "grad_norm": 275.96112060546875, + "learning_rate": 9.402903811252269e-06, + "loss": 34.2018, + "step": 3793 + }, + { + "epoch": 13.697065462753951, + "grad_norm": 349.28924560546875, + "learning_rate": 9.397459165154264e-06, + "loss": 37.8801, + "step": 3794 + }, + { + "epoch": 13.700677200902934, + "grad_norm": 288.47540283203125, + "learning_rate": 9.392014519056261e-06, + "loss": 37.5101, + "step": 3795 + }, + { + "epoch": 13.704288939051919, + "grad_norm": 255.31033325195312, + "learning_rate": 9.386569872958259e-06, + "loss": 36.9294, + "step": 3796 + }, + { + "epoch": 13.707900677200904, + "grad_norm": 273.757080078125, + "learning_rate": 9.381125226860254e-06, + "loss": 31.64, + "step": 3797 + }, + { + "epoch": 13.711512415349887, + "grad_norm": 236.24928283691406, + "learning_rate": 9.375680580762251e-06, + "loss": 22.9812, + "step": 3798 + }, + { + "epoch": 13.715124153498872, + "grad_norm": 206.70883178710938, + "learning_rate": 9.370235934664246e-06, + "loss": 22.4788, + "step": 3799 + }, + { + "epoch": 13.718735891647855, + "grad_norm": 168.15762329101562, + "learning_rate": 9.364791288566243e-06, + "loss": 23.3803, + "step": 3800 + }, + { + "epoch": 13.718735891647855, + "eval_loss": 0.6092759966850281, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.962, + "eval_steps_per_second": 56.962, + "step": 3800 + }, + { + "epoch": 13.72234762979684, + "grad_norm": 261.88397216796875, + "learning_rate": 9.35934664246824e-06, + "loss": 24.8757, + "step": 3801 + }, + { + "epoch": 13.725959367945824, + "grad_norm": 235.3518829345703, + "learning_rate": 9.353901996370236e-06, + "loss": 39.8777, + "step": 3802 + }, + { + "epoch": 13.729571106094808, + "grad_norm": 226.94027709960938, + "learning_rate": 9.348457350272231e-06, + "loss": 40.4357, + "step": 3803 + }, + { + "epoch": 13.733182844243792, + "grad_norm": 266.2643737792969, + "learning_rate": 9.34301270417423e-06, + "loss": 41.6411, + "step": 3804 + }, + { + "epoch": 13.736794582392777, + "grad_norm": 327.39288330078125, + "learning_rate": 9.337568058076225e-06, + "loss": 39.862, + "step": 3805 + }, + { + "epoch": 13.74040632054176, + "grad_norm": 241.03121948242188, + "learning_rate": 9.332123411978223e-06, + "loss": 39.1833, + "step": 3806 + }, + { + "epoch": 13.744018058690745, + "grad_norm": 232.2872314453125, + "learning_rate": 9.326678765880218e-06, + "loss": 40.6895, + "step": 3807 + }, + { + "epoch": 13.747629796839728, + "grad_norm": 236.909912109375, + "learning_rate": 9.321234119782213e-06, + "loss": 39.5891, + "step": 3808 + }, + { + "epoch": 13.751241534988713, + "grad_norm": 193.81478881835938, + "learning_rate": 9.315789473684212e-06, + "loss": 41.5211, + "step": 3809 + }, + { + "epoch": 13.754853273137698, + "grad_norm": 214.87301635742188, + "learning_rate": 9.310344827586207e-06, + "loss": 41.0726, + "step": 3810 + }, + { + "epoch": 13.754853273137698, + "eval_loss": 0.6098713874816895, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 3810 + }, + { + "epoch": 13.758465011286681, + "grad_norm": 196.57247924804688, + "learning_rate": 9.304900181488203e-06, + "loss": 40.1843, + "step": 3811 + }, + { + "epoch": 13.762076749435666, + "grad_norm": 215.59698486328125, + "learning_rate": 9.2994555353902e-06, + "loss": 37.6279, + "step": 3812 + }, + { + "epoch": 13.76568848758465, + "grad_norm": 221.1280059814453, + "learning_rate": 9.294010889292195e-06, + "loss": 37.9593, + "step": 3813 + }, + { + "epoch": 13.769300225733634, + "grad_norm": 314.94610595703125, + "learning_rate": 9.288566243194192e-06, + "loss": 37.3399, + "step": 3814 + }, + { + "epoch": 13.772911963882619, + "grad_norm": 240.10816955566406, + "learning_rate": 9.28312159709619e-06, + "loss": 38.3185, + "step": 3815 + }, + { + "epoch": 13.776523702031604, + "grad_norm": 229.2427978515625, + "learning_rate": 9.277676950998185e-06, + "loss": 36.9407, + "step": 3816 + }, + { + "epoch": 13.780135440180587, + "grad_norm": 224.78335571289062, + "learning_rate": 9.272232304900182e-06, + "loss": 39.3709, + "step": 3817 + }, + { + "epoch": 13.783747178329572, + "grad_norm": 216.5969696044922, + "learning_rate": 9.266787658802179e-06, + "loss": 38.2303, + "step": 3818 + }, + { + "epoch": 13.787358916478555, + "grad_norm": 208.7849884033203, + "learning_rate": 9.261343012704174e-06, + "loss": 39.492, + "step": 3819 + }, + { + "epoch": 13.79097065462754, + "grad_norm": 215.76475524902344, + "learning_rate": 9.255898366606171e-06, + "loss": 38.5599, + "step": 3820 + }, + { + "epoch": 13.79097065462754, + "eval_loss": 0.6080366969108582, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.041, + "eval_steps_per_second": 57.041, + "step": 3820 + }, + { + "epoch": 13.794582392776524, + "grad_norm": 224.64462280273438, + "learning_rate": 9.250453720508167e-06, + "loss": 39.315, + "step": 3821 + }, + { + "epoch": 13.798194130925507, + "grad_norm": 298.545654296875, + "learning_rate": 9.245009074410162e-06, + "loss": 38.3108, + "step": 3822 + }, + { + "epoch": 13.801805869074492, + "grad_norm": 236.5186767578125, + "learning_rate": 9.239564428312161e-06, + "loss": 39.9223, + "step": 3823 + }, + { + "epoch": 13.805417607223477, + "grad_norm": 251.47999572753906, + "learning_rate": 9.234119782214156e-06, + "loss": 39.4288, + "step": 3824 + }, + { + "epoch": 13.80902934537246, + "grad_norm": 260.8268737792969, + "learning_rate": 9.228675136116152e-06, + "loss": 38.276, + "step": 3825 + }, + { + "epoch": 13.812641083521445, + "grad_norm": 253.25172424316406, + "learning_rate": 9.223230490018149e-06, + "loss": 40.7118, + "step": 3826 + }, + { + "epoch": 13.816252821670428, + "grad_norm": 250.31784057617188, + "learning_rate": 9.217785843920146e-06, + "loss": 40.1916, + "step": 3827 + }, + { + "epoch": 13.819864559819413, + "grad_norm": 228.79234313964844, + "learning_rate": 9.212341197822143e-06, + "loss": 38.1513, + "step": 3828 + }, + { + "epoch": 13.823476297968398, + "grad_norm": 262.689697265625, + "learning_rate": 9.206896551724138e-06, + "loss": 38.43, + "step": 3829 + }, + { + "epoch": 13.827088036117381, + "grad_norm": 191.04139709472656, + "learning_rate": 9.201451905626134e-06, + "loss": 34.2476, + "step": 3830 + }, + { + "epoch": 13.827088036117381, + "eval_loss": 0.6077054142951965, + "eval_runtime": 3.1445, + "eval_samples_per_second": 56.925, + "eval_steps_per_second": 56.925, + "step": 3830 + }, + { + "epoch": 13.830699774266366, + "grad_norm": 236.3266143798828, + "learning_rate": 9.196007259528131e-06, + "loss": 33.7892, + "step": 3831 + }, + { + "epoch": 13.83431151241535, + "grad_norm": 284.8748474121094, + "learning_rate": 9.190562613430128e-06, + "loss": 31.9857, + "step": 3832 + }, + { + "epoch": 13.837923250564334, + "grad_norm": 261.17413330078125, + "learning_rate": 9.185117967332123e-06, + "loss": 32.8165, + "step": 3833 + }, + { + "epoch": 13.841534988713319, + "grad_norm": 195.1323699951172, + "learning_rate": 9.17967332123412e-06, + "loss": 33.1709, + "step": 3834 + }, + { + "epoch": 13.845146726862303, + "grad_norm": 220.5006561279297, + "learning_rate": 9.174228675136116e-06, + "loss": 33.149, + "step": 3835 + }, + { + "epoch": 13.848758465011286, + "grad_norm": 236.7254638671875, + "learning_rate": 9.168784029038111e-06, + "loss": 33.633, + "step": 3836 + }, + { + "epoch": 13.852370203160271, + "grad_norm": 269.1921691894531, + "learning_rate": 9.16333938294011e-06, + "loss": 34.6822, + "step": 3837 + }, + { + "epoch": 13.855981941309254, + "grad_norm": 222.4369354248047, + "learning_rate": 9.157894736842105e-06, + "loss": 35.2816, + "step": 3838 + }, + { + "epoch": 13.85959367945824, + "grad_norm": 232.4306640625, + "learning_rate": 9.152450090744102e-06, + "loss": 35.0067, + "step": 3839 + }, + { + "epoch": 13.863205417607224, + "grad_norm": 297.0786437988281, + "learning_rate": 9.147005444646098e-06, + "loss": 34.264, + "step": 3840 + }, + { + "epoch": 13.863205417607224, + "eval_loss": 0.6047748327255249, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 3840 + }, + { + "epoch": 13.866817155756207, + "grad_norm": 370.232421875, + "learning_rate": 9.141560798548095e-06, + "loss": 35.4996, + "step": 3841 + }, + { + "epoch": 13.870428893905192, + "grad_norm": 216.05775451660156, + "learning_rate": 9.136116152450092e-06, + "loss": 36.1403, + "step": 3842 + }, + { + "epoch": 13.874040632054175, + "grad_norm": 233.11138916015625, + "learning_rate": 9.130671506352087e-06, + "loss": 36.0324, + "step": 3843 + }, + { + "epoch": 13.87765237020316, + "grad_norm": 297.1761779785156, + "learning_rate": 9.125226860254083e-06, + "loss": 36.5617, + "step": 3844 + }, + { + "epoch": 13.881264108352145, + "grad_norm": 290.61590576171875, + "learning_rate": 9.11978221415608e-06, + "loss": 36.7113, + "step": 3845 + }, + { + "epoch": 13.884875846501128, + "grad_norm": 293.5744934082031, + "learning_rate": 9.114337568058077e-06, + "loss": 36.9964, + "step": 3846 + }, + { + "epoch": 13.888487584650113, + "grad_norm": 227.73455810546875, + "learning_rate": 9.108892921960072e-06, + "loss": 31.8552, + "step": 3847 + }, + { + "epoch": 13.892099322799098, + "grad_norm": 223.36077880859375, + "learning_rate": 9.10344827586207e-06, + "loss": 22.9122, + "step": 3848 + }, + { + "epoch": 13.89571106094808, + "grad_norm": 181.14501953125, + "learning_rate": 9.098003629764065e-06, + "loss": 22.366, + "step": 3849 + }, + { + "epoch": 13.899322799097066, + "grad_norm": 215.75856018066406, + "learning_rate": 9.092558983666063e-06, + "loss": 23.9545, + "step": 3850 + }, + { + "epoch": 13.899322799097066, + "eval_loss": 0.6072003245353699, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 3850 + }, + { + "epoch": 13.90293453724605, + "grad_norm": 233.22837829589844, + "learning_rate": 9.087114337568059e-06, + "loss": 23.5196, + "step": 3851 + }, + { + "epoch": 13.906546275395034, + "grad_norm": 269.9342041015625, + "learning_rate": 9.081669691470054e-06, + "loss": 41.4605, + "step": 3852 + }, + { + "epoch": 13.910158013544018, + "grad_norm": 304.4266662597656, + "learning_rate": 9.076225045372051e-06, + "loss": 40.2848, + "step": 3853 + }, + { + "epoch": 13.913769751693001, + "grad_norm": 318.2371520996094, + "learning_rate": 9.070780399274047e-06, + "loss": 41.0044, + "step": 3854 + }, + { + "epoch": 13.917381489841986, + "grad_norm": 272.9725341796875, + "learning_rate": 9.065335753176044e-06, + "loss": 40.776, + "step": 3855 + }, + { + "epoch": 13.920993227990971, + "grad_norm": 213.8822784423828, + "learning_rate": 9.059891107078041e-06, + "loss": 39.4964, + "step": 3856 + }, + { + "epoch": 13.924604966139954, + "grad_norm": 239.16128540039062, + "learning_rate": 9.054446460980036e-06, + "loss": 41.3482, + "step": 3857 + }, + { + "epoch": 13.928216704288939, + "grad_norm": 264.839111328125, + "learning_rate": 9.049001814882033e-06, + "loss": 38.2433, + "step": 3858 + }, + { + "epoch": 13.931828442437924, + "grad_norm": 244.00926208496094, + "learning_rate": 9.043557168784029e-06, + "loss": 38.6482, + "step": 3859 + }, + { + "epoch": 13.935440180586907, + "grad_norm": 342.8050537109375, + "learning_rate": 9.038112522686026e-06, + "loss": 39.2047, + "step": 3860 + }, + { + "epoch": 13.935440180586907, + "eval_loss": 0.6078094244003296, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3860 + }, + { + "epoch": 13.939051918735892, + "grad_norm": 232.509033203125, + "learning_rate": 9.032667876588023e-06, + "loss": 39.2827, + "step": 3861 + }, + { + "epoch": 13.942663656884875, + "grad_norm": 343.2891845703125, + "learning_rate": 9.027223230490018e-06, + "loss": 38.2709, + "step": 3862 + }, + { + "epoch": 13.94627539503386, + "grad_norm": 332.9613342285156, + "learning_rate": 9.021778584392014e-06, + "loss": 38.8266, + "step": 3863 + }, + { + "epoch": 13.949887133182845, + "grad_norm": 339.5653076171875, + "learning_rate": 9.016333938294012e-06, + "loss": 39.9249, + "step": 3864 + }, + { + "epoch": 13.953498871331828, + "grad_norm": 269.0108947753906, + "learning_rate": 9.010889292196008e-06, + "loss": 39.4593, + "step": 3865 + }, + { + "epoch": 13.957110609480813, + "grad_norm": 252.5339813232422, + "learning_rate": 9.005444646098003e-06, + "loss": 39.5471, + "step": 3866 + }, + { + "epoch": 13.960722347629797, + "grad_norm": 424.7225646972656, + "learning_rate": 9e-06, + "loss": 35.7505, + "step": 3867 + }, + { + "epoch": 13.96433408577878, + "grad_norm": 286.189208984375, + "learning_rate": 8.994555353901996e-06, + "loss": 32.445, + "step": 3868 + }, + { + "epoch": 13.967945823927765, + "grad_norm": 245.153564453125, + "learning_rate": 8.989110707803994e-06, + "loss": 33.2369, + "step": 3869 + }, + { + "epoch": 13.97155756207675, + "grad_norm": 305.3119812011719, + "learning_rate": 8.98366606170599e-06, + "loss": 31.7864, + "step": 3870 + }, + { + "epoch": 13.97155756207675, + "eval_loss": 0.6069231629371643, + "eval_runtime": 3.1471, + "eval_samples_per_second": 56.877, + "eval_steps_per_second": 56.877, + "step": 3870 + }, + { + "epoch": 13.975169300225733, + "grad_norm": 218.70913696289062, + "learning_rate": 8.978221415607985e-06, + "loss": 33.7166, + "step": 3871 + }, + { + "epoch": 13.978781038374718, + "grad_norm": 334.856201171875, + "learning_rate": 8.972776769509982e-06, + "loss": 35.8878, + "step": 3872 + }, + { + "epoch": 13.982392776523701, + "grad_norm": 305.65203857421875, + "learning_rate": 8.96733212341198e-06, + "loss": 35.1525, + "step": 3873 + }, + { + "epoch": 13.986004514672686, + "grad_norm": 330.148193359375, + "learning_rate": 8.961887477313975e-06, + "loss": 34.8268, + "step": 3874 + }, + { + "epoch": 13.989616252821671, + "grad_norm": 288.9424133300781, + "learning_rate": 8.956442831215972e-06, + "loss": 35.5068, + "step": 3875 + }, + { + "epoch": 13.993227990970654, + "grad_norm": 256.2596740722656, + "learning_rate": 8.950998185117967e-06, + "loss": 28.5016, + "step": 3876 + }, + { + "epoch": 13.996839729119639, + "grad_norm": 234.31991577148438, + "learning_rate": 8.945553539019963e-06, + "loss": 23.7416, + "step": 3877 + }, + { + "epoch": 14.0, + "grad_norm": 182.19000244140625, + "learning_rate": 8.940108892921961e-06, + "loss": 21.0329, + "step": 3878 + }, + { + "epoch": 14.003611738148985, + "grad_norm": 254.86355590820312, + "learning_rate": 8.934664246823957e-06, + "loss": 39.94, + "step": 3879 + }, + { + "epoch": 14.007223476297968, + "grad_norm": 229.75650024414062, + "learning_rate": 8.929219600725954e-06, + "loss": 40.3213, + "step": 3880 + }, + { + "epoch": 14.007223476297968, + "eval_loss": 0.604503870010376, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.96, + "eval_steps_per_second": 56.96, + "step": 3880 + }, + { + "epoch": 14.010835214446953, + "grad_norm": 220.18190002441406, + "learning_rate": 8.923774954627949e-06, + "loss": 40.1568, + "step": 3881 + }, + { + "epoch": 14.014446952595938, + "grad_norm": 269.5978088378906, + "learning_rate": 8.918330308529945e-06, + "loss": 40.3685, + "step": 3882 + }, + { + "epoch": 14.01805869074492, + "grad_norm": 254.3507537841797, + "learning_rate": 8.912885662431943e-06, + "loss": 40.0845, + "step": 3883 + }, + { + "epoch": 14.021670428893906, + "grad_norm": 251.43653869628906, + "learning_rate": 8.907441016333939e-06, + "loss": 40.1731, + "step": 3884 + }, + { + "epoch": 14.025282167042889, + "grad_norm": 215.91253662109375, + "learning_rate": 8.901996370235934e-06, + "loss": 39.7179, + "step": 3885 + }, + { + "epoch": 14.028893905191874, + "grad_norm": 247.81790161132812, + "learning_rate": 8.896551724137931e-06, + "loss": 41.0822, + "step": 3886 + }, + { + "epoch": 14.032505643340858, + "grad_norm": 232.45892333984375, + "learning_rate": 8.891107078039928e-06, + "loss": 39.7873, + "step": 3887 + }, + { + "epoch": 14.036117381489841, + "grad_norm": 231.8137969970703, + "learning_rate": 8.885662431941924e-06, + "loss": 41.1302, + "step": 3888 + }, + { + "epoch": 14.039729119638826, + "grad_norm": 219.09446716308594, + "learning_rate": 8.88021778584392e-06, + "loss": 39.2293, + "step": 3889 + }, + { + "epoch": 14.043340857787811, + "grad_norm": 187.99874877929688, + "learning_rate": 8.874773139745916e-06, + "loss": 37.3338, + "step": 3890 + }, + { + "epoch": 14.043340857787811, + "eval_loss": 0.603966236114502, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.077, + "eval_steps_per_second": 57.077, + "step": 3890 + }, + { + "epoch": 14.046952595936794, + "grad_norm": 285.2400207519531, + "learning_rate": 8.869328493647913e-06, + "loss": 36.9479, + "step": 3891 + }, + { + "epoch": 14.050564334085779, + "grad_norm": 234.23655700683594, + "learning_rate": 8.86388384754991e-06, + "loss": 35.1313, + "step": 3892 + }, + { + "epoch": 14.054176072234762, + "grad_norm": 234.78717041015625, + "learning_rate": 8.858439201451906e-06, + "loss": 36.5917, + "step": 3893 + }, + { + "epoch": 14.057787810383747, + "grad_norm": 226.53997802734375, + "learning_rate": 8.852994555353903e-06, + "loss": 38.3228, + "step": 3894 + }, + { + "epoch": 14.061399548532732, + "grad_norm": 222.05213928222656, + "learning_rate": 8.847549909255898e-06, + "loss": 37.3542, + "step": 3895 + }, + { + "epoch": 14.065011286681715, + "grad_norm": 222.9646759033203, + "learning_rate": 8.842105263157893e-06, + "loss": 37.6396, + "step": 3896 + }, + { + "epoch": 14.0686230248307, + "grad_norm": 227.78965759277344, + "learning_rate": 8.836660617059892e-06, + "loss": 38.1988, + "step": 3897 + }, + { + "epoch": 14.072234762979685, + "grad_norm": 200.89691162109375, + "learning_rate": 8.831215970961888e-06, + "loss": 38.3981, + "step": 3898 + }, + { + "epoch": 14.075846501128668, + "grad_norm": 212.52891540527344, + "learning_rate": 8.825771324863883e-06, + "loss": 37.3422, + "step": 3899 + }, + { + "epoch": 14.079458239277653, + "grad_norm": 312.33905029296875, + "learning_rate": 8.82032667876588e-06, + "loss": 38.1292, + "step": 3900 + }, + { + "epoch": 14.079458239277653, + "eval_loss": 0.6061921119689941, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.017, + "eval_steps_per_second": 57.017, + "step": 3900 + }, + { + "epoch": 14.083069977426636, + "grad_norm": 261.8415832519531, + "learning_rate": 8.814882032667877e-06, + "loss": 37.5543, + "step": 3901 + }, + { + "epoch": 14.08668171557562, + "grad_norm": 264.625732421875, + "learning_rate": 8.809437386569874e-06, + "loss": 39.3912, + "step": 3902 + }, + { + "epoch": 14.090293453724605, + "grad_norm": 305.7203063964844, + "learning_rate": 8.80399274047187e-06, + "loss": 39.7879, + "step": 3903 + }, + { + "epoch": 14.093905191873588, + "grad_norm": 282.63616943359375, + "learning_rate": 8.798548094373865e-06, + "loss": 38.7212, + "step": 3904 + }, + { + "epoch": 14.097516930022573, + "grad_norm": 246.49169921875, + "learning_rate": 8.793103448275862e-06, + "loss": 40.6198, + "step": 3905 + }, + { + "epoch": 14.101128668171558, + "grad_norm": 283.2737731933594, + "learning_rate": 8.787658802177859e-06, + "loss": 39.6947, + "step": 3906 + }, + { + "epoch": 14.104740406320541, + "grad_norm": 306.95721435546875, + "learning_rate": 8.782214156079855e-06, + "loss": 38.6157, + "step": 3907 + }, + { + "epoch": 14.108352144469526, + "grad_norm": 238.1789093017578, + "learning_rate": 8.776769509981852e-06, + "loss": 35.5328, + "step": 3908 + }, + { + "epoch": 14.111963882618511, + "grad_norm": 233.2298126220703, + "learning_rate": 8.771324863883847e-06, + "loss": 32.4008, + "step": 3909 + }, + { + "epoch": 14.115575620767494, + "grad_norm": 233.46339416503906, + "learning_rate": 8.765880217785846e-06, + "loss": 31.0712, + "step": 3910 + }, + { + "epoch": 14.115575620767494, + "eval_loss": 0.6046931147575378, + "eval_runtime": 3.1417, + "eval_samples_per_second": 56.976, + "eval_steps_per_second": 56.976, + "step": 3910 + }, + { + "epoch": 14.119187358916479, + "grad_norm": 226.30343627929688, + "learning_rate": 8.760435571687841e-06, + "loss": 33.252, + "step": 3911 + }, + { + "epoch": 14.122799097065462, + "grad_norm": 247.17465209960938, + "learning_rate": 8.754990925589837e-06, + "loss": 31.526, + "step": 3912 + }, + { + "epoch": 14.126410835214447, + "grad_norm": 208.25439453125, + "learning_rate": 8.749546279491834e-06, + "loss": 32.4838, + "step": 3913 + }, + { + "epoch": 14.130022573363432, + "grad_norm": 236.4488525390625, + "learning_rate": 8.744101633393829e-06, + "loss": 32.7987, + "step": 3914 + }, + { + "epoch": 14.133634311512415, + "grad_norm": 219.13279724121094, + "learning_rate": 8.738656987295826e-06, + "loss": 32.8516, + "step": 3915 + }, + { + "epoch": 14.1372460496614, + "grad_norm": 239.7289581298828, + "learning_rate": 8.733212341197823e-06, + "loss": 33.7763, + "step": 3916 + }, + { + "epoch": 14.140857787810384, + "grad_norm": 226.3568878173828, + "learning_rate": 8.727767695099819e-06, + "loss": 35.675, + "step": 3917 + }, + { + "epoch": 14.144469525959368, + "grad_norm": 302.84307861328125, + "learning_rate": 8.722323049001814e-06, + "loss": 34.0523, + "step": 3918 + }, + { + "epoch": 14.148081264108352, + "grad_norm": 280.40106201171875, + "learning_rate": 8.716878402903811e-06, + "loss": 35.2923, + "step": 3919 + }, + { + "epoch": 14.151693002257336, + "grad_norm": 238.30520629882812, + "learning_rate": 8.711433756805808e-06, + "loss": 36.0242, + "step": 3920 + }, + { + "epoch": 14.151693002257336, + "eval_loss": 0.6067762970924377, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 3920 + }, + { + "epoch": 14.15530474040632, + "grad_norm": 238.6465301513672, + "learning_rate": 8.705989110707805e-06, + "loss": 36.2959, + "step": 3921 + }, + { + "epoch": 14.158916478555305, + "grad_norm": 273.26837158203125, + "learning_rate": 8.7005444646098e-06, + "loss": 35.45, + "step": 3922 + }, + { + "epoch": 14.162528216704288, + "grad_norm": 296.907958984375, + "learning_rate": 8.695099818511796e-06, + "loss": 36.4428, + "step": 3923 + }, + { + "epoch": 14.166139954853273, + "grad_norm": 215.07374572753906, + "learning_rate": 8.689655172413795e-06, + "loss": 26.4171, + "step": 3924 + }, + { + "epoch": 14.169751693002258, + "grad_norm": 217.64779663085938, + "learning_rate": 8.68421052631579e-06, + "loss": 22.5483, + "step": 3925 + }, + { + "epoch": 14.173363431151241, + "grad_norm": 243.59364318847656, + "learning_rate": 8.678765880217785e-06, + "loss": 22.0396, + "step": 3926 + }, + { + "epoch": 14.176975169300226, + "grad_norm": 189.66969299316406, + "learning_rate": 8.673321234119783e-06, + "loss": 23.0957, + "step": 3927 + }, + { + "epoch": 14.18058690744921, + "grad_norm": 191.86180114746094, + "learning_rate": 8.667876588021778e-06, + "loss": 23.9385, + "step": 3928 + }, + { + "epoch": 14.184198645598194, + "grad_norm": 234.34896850585938, + "learning_rate": 8.662431941923775e-06, + "loss": 40.1665, + "step": 3929 + }, + { + "epoch": 14.187810383747179, + "grad_norm": 230.52401733398438, + "learning_rate": 8.656987295825772e-06, + "loss": 40.6752, + "step": 3930 + }, + { + "epoch": 14.187810383747179, + "eval_loss": 0.6088615655899048, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.998, + "eval_steps_per_second": 56.998, + "step": 3930 + }, + { + "epoch": 14.191422121896162, + "grad_norm": 234.06272888183594, + "learning_rate": 8.651542649727767e-06, + "loss": 40.7938, + "step": 3931 + }, + { + "epoch": 14.195033860045147, + "grad_norm": 344.4232482910156, + "learning_rate": 8.646098003629765e-06, + "loss": 38.7342, + "step": 3932 + }, + { + "epoch": 14.198645598194132, + "grad_norm": 375.74365234375, + "learning_rate": 8.640653357531762e-06, + "loss": 40.2052, + "step": 3933 + }, + { + "epoch": 14.202257336343115, + "grad_norm": 258.15570068359375, + "learning_rate": 8.635208711433757e-06, + "loss": 39.7266, + "step": 3934 + }, + { + "epoch": 14.2058690744921, + "grad_norm": 235.2681121826172, + "learning_rate": 8.629764065335754e-06, + "loss": 40.4821, + "step": 3935 + }, + { + "epoch": 14.209480812641084, + "grad_norm": 226.94764709472656, + "learning_rate": 8.62431941923775e-06, + "loss": 41.2414, + "step": 3936 + }, + { + "epoch": 14.213092550790067, + "grad_norm": 236.22109985351562, + "learning_rate": 8.618874773139745e-06, + "loss": 40.5807, + "step": 3937 + }, + { + "epoch": 14.216704288939052, + "grad_norm": 201.31112670898438, + "learning_rate": 8.613430127041744e-06, + "loss": 40.4824, + "step": 3938 + }, + { + "epoch": 14.220316027088035, + "grad_norm": 328.0167541503906, + "learning_rate": 8.607985480943739e-06, + "loss": 38.3881, + "step": 3939 + }, + { + "epoch": 14.22392776523702, + "grad_norm": 281.4416809082031, + "learning_rate": 8.602540834845734e-06, + "loss": 36.5777, + "step": 3940 + }, + { + "epoch": 14.22392776523702, + "eval_loss": 0.6099084615707397, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 3940 + }, + { + "epoch": 14.227539503386005, + "grad_norm": 258.5203552246094, + "learning_rate": 8.597096188747731e-06, + "loss": 37.5071, + "step": 3941 + }, + { + "epoch": 14.231151241534988, + "grad_norm": 274.8222351074219, + "learning_rate": 8.591651542649727e-06, + "loss": 36.358, + "step": 3942 + }, + { + "epoch": 14.234762979683973, + "grad_norm": 253.1671600341797, + "learning_rate": 8.586206896551726e-06, + "loss": 37.5859, + "step": 3943 + }, + { + "epoch": 14.238374717832958, + "grad_norm": 249.80943298339844, + "learning_rate": 8.580762250453721e-06, + "loss": 37.8799, + "step": 3944 + }, + { + "epoch": 14.241986455981941, + "grad_norm": 245.29103088378906, + "learning_rate": 8.575317604355716e-06, + "loss": 36.7551, + "step": 3945 + }, + { + "epoch": 14.245598194130926, + "grad_norm": 205.5915985107422, + "learning_rate": 8.569872958257713e-06, + "loss": 38.4761, + "step": 3946 + }, + { + "epoch": 14.249209932279909, + "grad_norm": 218.10328674316406, + "learning_rate": 8.56442831215971e-06, + "loss": 37.5862, + "step": 3947 + }, + { + "epoch": 14.252821670428894, + "grad_norm": 273.5924072265625, + "learning_rate": 8.558983666061706e-06, + "loss": 39.2851, + "step": 3948 + }, + { + "epoch": 14.256433408577879, + "grad_norm": 235.48069763183594, + "learning_rate": 8.553539019963703e-06, + "loss": 39.0707, + "step": 3949 + }, + { + "epoch": 14.260045146726862, + "grad_norm": 230.93150329589844, + "learning_rate": 8.548094373865698e-06, + "loss": 37.8469, + "step": 3950 + }, + { + "epoch": 14.260045146726862, + "eval_loss": 0.6072147488594055, + "eval_runtime": 3.1414, + "eval_samples_per_second": 56.982, + "eval_steps_per_second": 56.982, + "step": 3950 + }, + { + "epoch": 14.263656884875846, + "grad_norm": 226.3638458251953, + "learning_rate": 8.542649727767695e-06, + "loss": 39.4245, + "step": 3951 + }, + { + "epoch": 14.267268623024831, + "grad_norm": 226.74595642089844, + "learning_rate": 8.537205081669693e-06, + "loss": 38.116, + "step": 3952 + }, + { + "epoch": 14.270880361173814, + "grad_norm": 226.1452178955078, + "learning_rate": 8.531760435571688e-06, + "loss": 39.9114, + "step": 3953 + }, + { + "epoch": 14.2744920993228, + "grad_norm": 387.8020324707031, + "learning_rate": 8.526315789473685e-06, + "loss": 38.9457, + "step": 3954 + }, + { + "epoch": 14.278103837471784, + "grad_norm": 381.5679931640625, + "learning_rate": 8.52087114337568e-06, + "loss": 40.7989, + "step": 3955 + }, + { + "epoch": 14.281715575620767, + "grad_norm": 246.16464233398438, + "learning_rate": 8.515426497277677e-06, + "loss": 37.6288, + "step": 3956 + }, + { + "epoch": 14.285327313769752, + "grad_norm": 337.05059814453125, + "learning_rate": 8.509981851179674e-06, + "loss": 37.3276, + "step": 3957 + }, + { + "epoch": 14.288939051918735, + "grad_norm": 223.80421447753906, + "learning_rate": 8.50453720508167e-06, + "loss": 33.9465, + "step": 3958 + }, + { + "epoch": 14.29255079006772, + "grad_norm": 218.9332275390625, + "learning_rate": 8.499092558983665e-06, + "loss": 33.0305, + "step": 3959 + }, + { + "epoch": 14.296162528216705, + "grad_norm": 254.20726013183594, + "learning_rate": 8.493647912885662e-06, + "loss": 31.3806, + "step": 3960 + }, + { + "epoch": 14.296162528216705, + "eval_loss": 0.6070483922958374, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.984, + "eval_steps_per_second": 56.984, + "step": 3960 + }, + { + "epoch": 14.299774266365688, + "grad_norm": 232.96702575683594, + "learning_rate": 8.48820326678766e-06, + "loss": 31.7001, + "step": 3961 + }, + { + "epoch": 14.303386004514673, + "grad_norm": 305.31207275390625, + "learning_rate": 8.482758620689656e-06, + "loss": 32.2629, + "step": 3962 + }, + { + "epoch": 14.306997742663658, + "grad_norm": 253.60858154296875, + "learning_rate": 8.477313974591652e-06, + "loss": 34.2635, + "step": 3963 + }, + { + "epoch": 14.31060948081264, + "grad_norm": 395.4168701171875, + "learning_rate": 8.471869328493647e-06, + "loss": 34.6987, + "step": 3964 + }, + { + "epoch": 14.314221218961626, + "grad_norm": 279.72845458984375, + "learning_rate": 8.466424682395644e-06, + "loss": 34.5488, + "step": 3965 + }, + { + "epoch": 14.317832957110609, + "grad_norm": 285.7306213378906, + "learning_rate": 8.460980036297641e-06, + "loss": 35.2566, + "step": 3966 + }, + { + "epoch": 14.321444695259594, + "grad_norm": 229.04226684570312, + "learning_rate": 8.455535390199637e-06, + "loss": 34.5273, + "step": 3967 + }, + { + "epoch": 14.325056433408578, + "grad_norm": 232.50205993652344, + "learning_rate": 8.450090744101634e-06, + "loss": 34.6337, + "step": 3968 + }, + { + "epoch": 14.328668171557561, + "grad_norm": 225.87583923339844, + "learning_rate": 8.44464609800363e-06, + "loss": 35.1575, + "step": 3969 + }, + { + "epoch": 14.332279909706546, + "grad_norm": 266.2709045410156, + "learning_rate": 8.439201451905626e-06, + "loss": 34.2619, + "step": 3970 + }, + { + "epoch": 14.332279909706546, + "eval_loss": 0.6066078543663025, + "eval_runtime": 3.149, + "eval_samples_per_second": 56.843, + "eval_steps_per_second": 56.843, + "step": 3970 + }, + { + "epoch": 14.335891647855531, + "grad_norm": 283.557373046875, + "learning_rate": 8.433756805807623e-06, + "loss": 35.5713, + "step": 3971 + }, + { + "epoch": 14.339503386004514, + "grad_norm": 288.43707275390625, + "learning_rate": 8.428312159709619e-06, + "loss": 36.7442, + "step": 3972 + }, + { + "epoch": 14.343115124153499, + "grad_norm": 331.3218994140625, + "learning_rate": 8.422867513611616e-06, + "loss": 35.5839, + "step": 3973 + }, + { + "epoch": 14.346726862302482, + "grad_norm": 257.1488037109375, + "learning_rate": 8.417422867513611e-06, + "loss": 30.2221, + "step": 3974 + }, + { + "epoch": 14.350338600451467, + "grad_norm": 200.0919189453125, + "learning_rate": 8.411978221415608e-06, + "loss": 22.217, + "step": 3975 + }, + { + "epoch": 14.353950338600452, + "grad_norm": 245.030029296875, + "learning_rate": 8.406533575317605e-06, + "loss": 22.8927, + "step": 3976 + }, + { + "epoch": 14.357562076749435, + "grad_norm": 208.5701904296875, + "learning_rate": 8.4010889292196e-06, + "loss": 22.9537, + "step": 3977 + }, + { + "epoch": 14.36117381489842, + "grad_norm": 232.0613250732422, + "learning_rate": 8.395644283121596e-06, + "loss": 24.5304, + "step": 3978 + }, + { + "epoch": 14.364785553047405, + "grad_norm": 193.56541442871094, + "learning_rate": 8.390199637023595e-06, + "loss": 39.4552, + "step": 3979 + }, + { + "epoch": 14.368397291196388, + "grad_norm": 230.35507202148438, + "learning_rate": 8.38475499092559e-06, + "loss": 41.0417, + "step": 3980 + }, + { + "epoch": 14.368397291196388, + "eval_loss": 0.6071842908859253, + "eval_runtime": 3.1416, + "eval_samples_per_second": 56.978, + "eval_steps_per_second": 56.978, + "step": 3980 + }, + { + "epoch": 14.372009029345373, + "grad_norm": 191.09242248535156, + "learning_rate": 8.379310344827586e-06, + "loss": 40.1548, + "step": 3981 + }, + { + "epoch": 14.375620767494357, + "grad_norm": 249.24520874023438, + "learning_rate": 8.373865698729583e-06, + "loss": 39.5746, + "step": 3982 + }, + { + "epoch": 14.37923250564334, + "grad_norm": 266.509033203125, + "learning_rate": 8.368421052631578e-06, + "loss": 39.2388, + "step": 3983 + }, + { + "epoch": 14.382844243792325, + "grad_norm": 255.36209106445312, + "learning_rate": 8.362976406533577e-06, + "loss": 39.9314, + "step": 3984 + }, + { + "epoch": 14.386455981941308, + "grad_norm": 239.0690460205078, + "learning_rate": 8.357531760435572e-06, + "loss": 39.9124, + "step": 3985 + }, + { + "epoch": 14.390067720090293, + "grad_norm": 211.36135864257812, + "learning_rate": 8.352087114337568e-06, + "loss": 40.1307, + "step": 3986 + }, + { + "epoch": 14.393679458239278, + "grad_norm": 215.28912353515625, + "learning_rate": 8.346642468239565e-06, + "loss": 40.5252, + "step": 3987 + }, + { + "epoch": 14.397291196388261, + "grad_norm": 240.84271240234375, + "learning_rate": 8.34119782214156e-06, + "loss": 40.8348, + "step": 3988 + }, + { + "epoch": 14.400902934537246, + "grad_norm": 228.41758728027344, + "learning_rate": 8.335753176043557e-06, + "loss": 39.8228, + "step": 3989 + }, + { + "epoch": 14.404514672686231, + "grad_norm": 203.0228729248047, + "learning_rate": 8.330308529945554e-06, + "loss": 38.0696, + "step": 3990 + }, + { + "epoch": 14.404514672686231, + "eval_loss": 0.6064196825027466, + "eval_runtime": 3.1413, + "eval_samples_per_second": 56.983, + "eval_steps_per_second": 56.983, + "step": 3990 + }, + { + "epoch": 14.408126410835214, + "grad_norm": 245.14646911621094, + "learning_rate": 8.32486388384755e-06, + "loss": 37.3921, + "step": 3991 + }, + { + "epoch": 14.411738148984199, + "grad_norm": 230.0685577392578, + "learning_rate": 8.319419237749545e-06, + "loss": 36.8794, + "step": 3992 + }, + { + "epoch": 14.415349887133182, + "grad_norm": 203.02955627441406, + "learning_rate": 8.313974591651544e-06, + "loss": 38.011, + "step": 3993 + }, + { + "epoch": 14.418961625282167, + "grad_norm": 276.0522766113281, + "learning_rate": 8.30852994555354e-06, + "loss": 37.8114, + "step": 3994 + }, + { + "epoch": 14.422573363431152, + "grad_norm": 205.56423950195312, + "learning_rate": 8.303085299455536e-06, + "loss": 38.1956, + "step": 3995 + }, + { + "epoch": 14.426185101580135, + "grad_norm": 200.71507263183594, + "learning_rate": 8.297640653357532e-06, + "loss": 36.4471, + "step": 3996 + }, + { + "epoch": 14.42979683972912, + "grad_norm": 217.8540496826172, + "learning_rate": 8.292196007259527e-06, + "loss": 37.6204, + "step": 3997 + }, + { + "epoch": 14.433408577878104, + "grad_norm": 228.0621337890625, + "learning_rate": 8.286751361161526e-06, + "loss": 38.6074, + "step": 3998 + }, + { + "epoch": 14.437020316027088, + "grad_norm": 246.05203247070312, + "learning_rate": 8.281306715063521e-06, + "loss": 37.8614, + "step": 3999 + }, + { + "epoch": 14.440632054176072, + "grad_norm": 216.0327911376953, + "learning_rate": 8.275862068965517e-06, + "loss": 37.4941, + "step": 4000 + }, + { + "epoch": 14.440632054176072, + "eval_loss": 0.605604887008667, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4000 + }, + { + "epoch": 14.444243792325057, + "grad_norm": 292.38653564453125, + "learning_rate": 8.270417422867514e-06, + "loss": 37.9576, + "step": 4001 + }, + { + "epoch": 14.44785553047404, + "grad_norm": 268.2558288574219, + "learning_rate": 8.26497277676951e-06, + "loss": 38.7505, + "step": 4002 + }, + { + "epoch": 14.451467268623025, + "grad_norm": 324.135498046875, + "learning_rate": 8.259528130671508e-06, + "loss": 39.9733, + "step": 4003 + }, + { + "epoch": 14.455079006772008, + "grad_norm": 269.1458740234375, + "learning_rate": 8.254083484573503e-06, + "loss": 38.8272, + "step": 4004 + }, + { + "epoch": 14.458690744920993, + "grad_norm": 214.26547241210938, + "learning_rate": 8.248638838475499e-06, + "loss": 37.7277, + "step": 4005 + }, + { + "epoch": 14.462302483069978, + "grad_norm": 256.4419860839844, + "learning_rate": 8.243194192377496e-06, + "loss": 39.0446, + "step": 4006 + }, + { + "epoch": 14.465914221218961, + "grad_norm": 226.9741973876953, + "learning_rate": 8.237749546279493e-06, + "loss": 34.2491, + "step": 4007 + }, + { + "epoch": 14.469525959367946, + "grad_norm": 238.4901123046875, + "learning_rate": 8.232304900181488e-06, + "loss": 32.1969, + "step": 4008 + }, + { + "epoch": 14.47313769751693, + "grad_norm": 260.6334533691406, + "learning_rate": 8.226860254083485e-06, + "loss": 32.5999, + "step": 4009 + }, + { + "epoch": 14.476749435665914, + "grad_norm": 227.4844970703125, + "learning_rate": 8.22141560798548e-06, + "loss": 30.3598, + "step": 4010 + }, + { + "epoch": 14.476749435665914, + "eval_loss": 0.6049788594245911, + "eval_runtime": 3.1451, + "eval_samples_per_second": 56.914, + "eval_steps_per_second": 56.914, + "step": 4010 + }, + { + "epoch": 14.480361173814899, + "grad_norm": 231.49935913085938, + "learning_rate": 8.215970961887476e-06, + "loss": 32.3228, + "step": 4011 + }, + { + "epoch": 14.483972911963882, + "grad_norm": 246.83099365234375, + "learning_rate": 8.210526315789475e-06, + "loss": 32.1275, + "step": 4012 + }, + { + "epoch": 14.487584650112867, + "grad_norm": 283.0715026855469, + "learning_rate": 8.20508166969147e-06, + "loss": 32.9237, + "step": 4013 + }, + { + "epoch": 14.491196388261852, + "grad_norm": 264.58941650390625, + "learning_rate": 8.199637023593467e-06, + "loss": 34.3091, + "step": 4014 + }, + { + "epoch": 14.494808126410835, + "grad_norm": 207.57241821289062, + "learning_rate": 8.194192377495463e-06, + "loss": 34.2317, + "step": 4015 + }, + { + "epoch": 14.49841986455982, + "grad_norm": 266.3730163574219, + "learning_rate": 8.18874773139746e-06, + "loss": 35.5423, + "step": 4016 + }, + { + "epoch": 14.502031602708804, + "grad_norm": 274.2936096191406, + "learning_rate": 8.183303085299457e-06, + "loss": 34.0383, + "step": 4017 + }, + { + "epoch": 14.505643340857787, + "grad_norm": 345.4320068359375, + "learning_rate": 8.177858439201452e-06, + "loss": 35.6892, + "step": 4018 + }, + { + "epoch": 14.509255079006772, + "grad_norm": 254.9503631591797, + "learning_rate": 8.172413793103448e-06, + "loss": 34.4219, + "step": 4019 + }, + { + "epoch": 14.512866817155757, + "grad_norm": 277.176025390625, + "learning_rate": 8.166969147005445e-06, + "loss": 34.6322, + "step": 4020 + }, + { + "epoch": 14.512866817155757, + "eval_loss": 0.6078911423683167, + "eval_runtime": 3.1428, + "eval_samples_per_second": 56.956, + "eval_steps_per_second": 56.956, + "step": 4020 + }, + { + "epoch": 14.51647855530474, + "grad_norm": 267.24737548828125, + "learning_rate": 8.161524500907442e-06, + "loss": 36.4843, + "step": 4021 + }, + { + "epoch": 14.520090293453725, + "grad_norm": 291.5208740234375, + "learning_rate": 8.156079854809437e-06, + "loss": 36.347, + "step": 4022 + }, + { + "epoch": 14.523702031602708, + "grad_norm": 331.9736328125, + "learning_rate": 8.150635208711434e-06, + "loss": 36.5678, + "step": 4023 + }, + { + "epoch": 14.527313769751693, + "grad_norm": 283.7598876953125, + "learning_rate": 8.14519056261343e-06, + "loss": 29.4886, + "step": 4024 + }, + { + "epoch": 14.530925507900678, + "grad_norm": 214.61712646484375, + "learning_rate": 8.139745916515427e-06, + "loss": 23.2178, + "step": 4025 + }, + { + "epoch": 14.534537246049661, + "grad_norm": 286.7948913574219, + "learning_rate": 8.134301270417424e-06, + "loss": 22.0972, + "step": 4026 + }, + { + "epoch": 14.538148984198646, + "grad_norm": 230.6540069580078, + "learning_rate": 8.128856624319419e-06, + "loss": 23.2764, + "step": 4027 + }, + { + "epoch": 14.54176072234763, + "grad_norm": 300.9560241699219, + "learning_rate": 8.123411978221416e-06, + "loss": 24.1889, + "step": 4028 + }, + { + "epoch": 14.545372460496614, + "grad_norm": 211.4068145751953, + "learning_rate": 8.117967332123412e-06, + "loss": 39.0039, + "step": 4029 + }, + { + "epoch": 14.548984198645599, + "grad_norm": 274.3965759277344, + "learning_rate": 8.112522686025409e-06, + "loss": 41.1832, + "step": 4030 + }, + { + "epoch": 14.548984198645599, + "eval_loss": 0.6079195141792297, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4030 + }, + { + "epoch": 14.552595936794582, + "grad_norm": 247.50657653808594, + "learning_rate": 8.107078039927406e-06, + "loss": 38.28, + "step": 4031 + }, + { + "epoch": 14.556207674943566, + "grad_norm": 216.0500946044922, + "learning_rate": 8.101633393829401e-06, + "loss": 39.5079, + "step": 4032 + }, + { + "epoch": 14.559819413092551, + "grad_norm": 271.37066650390625, + "learning_rate": 8.096188747731396e-06, + "loss": 40.1902, + "step": 4033 + }, + { + "epoch": 14.563431151241534, + "grad_norm": 233.35415649414062, + "learning_rate": 8.090744101633394e-06, + "loss": 40.2113, + "step": 4034 + }, + { + "epoch": 14.56704288939052, + "grad_norm": 214.67381286621094, + "learning_rate": 8.08529945553539e-06, + "loss": 39.794, + "step": 4035 + }, + { + "epoch": 14.570654627539504, + "grad_norm": 298.1142578125, + "learning_rate": 8.079854809437388e-06, + "loss": 39.9214, + "step": 4036 + }, + { + "epoch": 14.574266365688487, + "grad_norm": 197.40823364257812, + "learning_rate": 8.074410163339383e-06, + "loss": 40.9599, + "step": 4037 + }, + { + "epoch": 14.577878103837472, + "grad_norm": 242.1573028564453, + "learning_rate": 8.068965517241378e-06, + "loss": 40.2351, + "step": 4038 + }, + { + "epoch": 14.581489841986457, + "grad_norm": 224.93801879882812, + "learning_rate": 8.063520871143377e-06, + "loss": 39.0174, + "step": 4039 + }, + { + "epoch": 14.58510158013544, + "grad_norm": 295.4931335449219, + "learning_rate": 8.058076225045373e-06, + "loss": 37.4696, + "step": 4040 + }, + { + "epoch": 14.58510158013544, + "eval_loss": 0.6091852188110352, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.064, + "eval_steps_per_second": 57.064, + "step": 4040 + }, + { + "epoch": 14.588713318284425, + "grad_norm": 302.8267517089844, + "learning_rate": 8.052631578947368e-06, + "loss": 37.3227, + "step": 4041 + }, + { + "epoch": 14.592325056433408, + "grad_norm": 355.2379150390625, + "learning_rate": 8.047186932849365e-06, + "loss": 38.433, + "step": 4042 + }, + { + "epoch": 14.595936794582393, + "grad_norm": 304.96234130859375, + "learning_rate": 8.04174228675136e-06, + "loss": 37.8352, + "step": 4043 + }, + { + "epoch": 14.599548532731378, + "grad_norm": 309.294921875, + "learning_rate": 8.036297640653358e-06, + "loss": 38.1734, + "step": 4044 + }, + { + "epoch": 14.60316027088036, + "grad_norm": 216.3328399658203, + "learning_rate": 8.030852994555355e-06, + "loss": 37.3612, + "step": 4045 + }, + { + "epoch": 14.606772009029346, + "grad_norm": 250.9885711669922, + "learning_rate": 8.02540834845735e-06, + "loss": 39.1612, + "step": 4046 + }, + { + "epoch": 14.610383747178329, + "grad_norm": 215.0750732421875, + "learning_rate": 8.019963702359347e-06, + "loss": 39.6837, + "step": 4047 + }, + { + "epoch": 14.613995485327314, + "grad_norm": 234.02069091796875, + "learning_rate": 8.014519056261342e-06, + "loss": 37.9746, + "step": 4048 + }, + { + "epoch": 14.617607223476298, + "grad_norm": 233.7527313232422, + "learning_rate": 8.00907441016334e-06, + "loss": 38.5114, + "step": 4049 + }, + { + "epoch": 14.621218961625281, + "grad_norm": 271.77496337890625, + "learning_rate": 8.003629764065337e-06, + "loss": 37.1647, + "step": 4050 + }, + { + "epoch": 14.621218961625281, + "eval_loss": 0.6047770977020264, + "eval_runtime": 3.1379, + "eval_samples_per_second": 57.045, + "eval_steps_per_second": 57.045, + "step": 4050 + }, + { + "epoch": 14.624830699774266, + "grad_norm": 281.7846374511719, + "learning_rate": 7.998185117967332e-06, + "loss": 38.981, + "step": 4051 + }, + { + "epoch": 14.628442437923251, + "grad_norm": 308.8702697753906, + "learning_rate": 7.992740471869327e-06, + "loss": 39.4821, + "step": 4052 + }, + { + "epoch": 14.632054176072234, + "grad_norm": 366.1501770019531, + "learning_rate": 7.987295825771326e-06, + "loss": 39.0898, + "step": 4053 + }, + { + "epoch": 14.635665914221219, + "grad_norm": 276.92962646484375, + "learning_rate": 7.981851179673322e-06, + "loss": 39.6162, + "step": 4054 + }, + { + "epoch": 14.639277652370204, + "grad_norm": 220.0023651123047, + "learning_rate": 7.976406533575319e-06, + "loss": 38.5888, + "step": 4055 + }, + { + "epoch": 14.642889390519187, + "grad_norm": 268.57293701171875, + "learning_rate": 7.970961887477314e-06, + "loss": 38.4631, + "step": 4056 + }, + { + "epoch": 14.646501128668172, + "grad_norm": 307.8072509765625, + "learning_rate": 7.96551724137931e-06, + "loss": 35.4139, + "step": 4057 + }, + { + "epoch": 14.650112866817155, + "grad_norm": 228.11767578125, + "learning_rate": 7.960072595281308e-06, + "loss": 33.3694, + "step": 4058 + }, + { + "epoch": 14.65372460496614, + "grad_norm": 217.6271209716797, + "learning_rate": 7.954627949183304e-06, + "loss": 31.3355, + "step": 4059 + }, + { + "epoch": 14.657336343115125, + "grad_norm": 232.31944274902344, + "learning_rate": 7.949183303085299e-06, + "loss": 32.8306, + "step": 4060 + }, + { + "epoch": 14.657336343115125, + "eval_loss": 0.6018487215042114, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 4060 + }, + { + "epoch": 14.660948081264108, + "grad_norm": 244.58303833007812, + "learning_rate": 7.943738656987296e-06, + "loss": 33.2157, + "step": 4061 + }, + { + "epoch": 14.664559819413093, + "grad_norm": 306.12005615234375, + "learning_rate": 7.938294010889293e-06, + "loss": 33.6361, + "step": 4062 + }, + { + "epoch": 14.668171557562077, + "grad_norm": 266.2792053222656, + "learning_rate": 7.932849364791288e-06, + "loss": 32.3917, + "step": 4063 + }, + { + "epoch": 14.67178329571106, + "grad_norm": 259.373779296875, + "learning_rate": 7.927404718693286e-06, + "loss": 33.3598, + "step": 4064 + }, + { + "epoch": 14.675395033860045, + "grad_norm": 247.35179138183594, + "learning_rate": 7.921960072595281e-06, + "loss": 32.2699, + "step": 4065 + }, + { + "epoch": 14.679006772009028, + "grad_norm": 280.02960205078125, + "learning_rate": 7.916515426497278e-06, + "loss": 33.0305, + "step": 4066 + }, + { + "epoch": 14.682618510158013, + "grad_norm": 394.6492919921875, + "learning_rate": 7.911070780399275e-06, + "loss": 35.1854, + "step": 4067 + }, + { + "epoch": 14.686230248306998, + "grad_norm": 298.6531677246094, + "learning_rate": 7.90562613430127e-06, + "loss": 35.1836, + "step": 4068 + }, + { + "epoch": 14.689841986455981, + "grad_norm": 250.960693359375, + "learning_rate": 7.900181488203268e-06, + "loss": 32.6266, + "step": 4069 + }, + { + "epoch": 14.693453724604966, + "grad_norm": 240.4825897216797, + "learning_rate": 7.894736842105263e-06, + "loss": 35.5937, + "step": 4070 + }, + { + "epoch": 14.693453724604966, + "eval_loss": 0.6042065620422363, + "eval_runtime": 3.1453, + "eval_samples_per_second": 56.91, + "eval_steps_per_second": 56.91, + "step": 4070 + }, + { + "epoch": 14.697065462753951, + "grad_norm": 274.6919860839844, + "learning_rate": 7.889292196007258e-06, + "loss": 36.4225, + "step": 4071 + }, + { + "epoch": 14.700677200902934, + "grad_norm": 245.4980010986328, + "learning_rate": 7.883847549909257e-06, + "loss": 36.5503, + "step": 4072 + }, + { + "epoch": 14.704288939051919, + "grad_norm": 373.362548828125, + "learning_rate": 7.878402903811252e-06, + "loss": 35.38, + "step": 4073 + }, + { + "epoch": 14.707900677200904, + "grad_norm": 337.5054626464844, + "learning_rate": 7.872958257713248e-06, + "loss": 28.869, + "step": 4074 + }, + { + "epoch": 14.711512415349887, + "grad_norm": 238.19195556640625, + "learning_rate": 7.867513611615245e-06, + "loss": 22.99, + "step": 4075 + }, + { + "epoch": 14.715124153498872, + "grad_norm": 254.274169921875, + "learning_rate": 7.862068965517242e-06, + "loss": 22.5274, + "step": 4076 + }, + { + "epoch": 14.718735891647855, + "grad_norm": 236.74099731445312, + "learning_rate": 7.856624319419239e-06, + "loss": 23.6756, + "step": 4077 + }, + { + "epoch": 14.72234762979684, + "grad_norm": 239.69911193847656, + "learning_rate": 7.851179673321234e-06, + "loss": 23.2024, + "step": 4078 + }, + { + "epoch": 14.725959367945824, + "grad_norm": 296.35101318359375, + "learning_rate": 7.84573502722323e-06, + "loss": 40.0026, + "step": 4079 + }, + { + "epoch": 14.729571106094808, + "grad_norm": 202.52577209472656, + "learning_rate": 7.840290381125227e-06, + "loss": 41.2817, + "step": 4080 + }, + { + "epoch": 14.729571106094808, + "eval_loss": 0.6069625616073608, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4080 + }, + { + "epoch": 14.733182844243792, + "grad_norm": 290.4194030761719, + "learning_rate": 7.834845735027224e-06, + "loss": 40.5411, + "step": 4081 + }, + { + "epoch": 14.736794582392777, + "grad_norm": 284.0616455078125, + "learning_rate": 7.82940108892922e-06, + "loss": 40.6588, + "step": 4082 + }, + { + "epoch": 14.74040632054176, + "grad_norm": 289.5628967285156, + "learning_rate": 7.823956442831216e-06, + "loss": 38.986, + "step": 4083 + }, + { + "epoch": 14.744018058690745, + "grad_norm": 217.09841918945312, + "learning_rate": 7.818511796733212e-06, + "loss": 38.83, + "step": 4084 + }, + { + "epoch": 14.747629796839728, + "grad_norm": 223.49148559570312, + "learning_rate": 7.813067150635209e-06, + "loss": 39.4897, + "step": 4085 + }, + { + "epoch": 14.751241534988713, + "grad_norm": 240.41578674316406, + "learning_rate": 7.807622504537206e-06, + "loss": 38.9963, + "step": 4086 + }, + { + "epoch": 14.754853273137698, + "grad_norm": 206.7586212158203, + "learning_rate": 7.802177858439201e-06, + "loss": 39.7875, + "step": 4087 + }, + { + "epoch": 14.758465011286681, + "grad_norm": 239.97174072265625, + "learning_rate": 7.796733212341198e-06, + "loss": 39.3977, + "step": 4088 + }, + { + "epoch": 14.762076749435666, + "grad_norm": 204.50839233398438, + "learning_rate": 7.791288566243194e-06, + "loss": 38.7869, + "step": 4089 + }, + { + "epoch": 14.76568848758465, + "grad_norm": 216.79583740234375, + "learning_rate": 7.785843920145191e-06, + "loss": 36.7325, + "step": 4090 + }, + { + "epoch": 14.76568848758465, + "eval_loss": 0.6052367091178894, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.93, + "eval_steps_per_second": 56.93, + "step": 4090 + }, + { + "epoch": 14.769300225733634, + "grad_norm": 251.13209533691406, + "learning_rate": 7.780399274047188e-06, + "loss": 38.2377, + "step": 4091 + }, + { + "epoch": 14.772911963882619, + "grad_norm": 222.745361328125, + "learning_rate": 7.774954627949183e-06, + "loss": 36.8119, + "step": 4092 + }, + { + "epoch": 14.776523702031604, + "grad_norm": 252.72117614746094, + "learning_rate": 7.769509981851179e-06, + "loss": 38.1241, + "step": 4093 + }, + { + "epoch": 14.780135440180587, + "grad_norm": 272.38165283203125, + "learning_rate": 7.764065335753176e-06, + "loss": 37.6839, + "step": 4094 + }, + { + "epoch": 14.783747178329572, + "grad_norm": 301.0637512207031, + "learning_rate": 7.758620689655173e-06, + "loss": 38.1267, + "step": 4095 + }, + { + "epoch": 14.787358916478555, + "grad_norm": 240.22515869140625, + "learning_rate": 7.75317604355717e-06, + "loss": 36.9847, + "step": 4096 + }, + { + "epoch": 14.79097065462754, + "grad_norm": 273.3988952636719, + "learning_rate": 7.747731397459165e-06, + "loss": 39.0368, + "step": 4097 + }, + { + "epoch": 14.794582392776524, + "grad_norm": 252.66497802734375, + "learning_rate": 7.74228675136116e-06, + "loss": 38.6439, + "step": 4098 + }, + { + "epoch": 14.798194130925507, + "grad_norm": 246.3287811279297, + "learning_rate": 7.73684210526316e-06, + "loss": 36.3503, + "step": 4099 + }, + { + "epoch": 14.801805869074492, + "grad_norm": 220.6704559326172, + "learning_rate": 7.731397459165155e-06, + "loss": 38.1603, + "step": 4100 + }, + { + "epoch": 14.801805869074492, + "eval_loss": 0.6043270826339722, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4100 + }, + { + "epoch": 14.805417607223477, + "grad_norm": 215.94979858398438, + "learning_rate": 7.72595281306715e-06, + "loss": 38.9624, + "step": 4101 + }, + { + "epoch": 14.80902934537246, + "grad_norm": 228.76815795898438, + "learning_rate": 7.720508166969147e-06, + "loss": 39.2196, + "step": 4102 + }, + { + "epoch": 14.812641083521445, + "grad_norm": 216.1998291015625, + "learning_rate": 7.715063520871143e-06, + "loss": 39.3677, + "step": 4103 + }, + { + "epoch": 14.816252821670428, + "grad_norm": 266.1018981933594, + "learning_rate": 7.70961887477314e-06, + "loss": 38.1856, + "step": 4104 + }, + { + "epoch": 14.819864559819413, + "grad_norm": 234.2566680908203, + "learning_rate": 7.704174228675137e-06, + "loss": 39.6282, + "step": 4105 + }, + { + "epoch": 14.823476297968398, + "grad_norm": 241.16615295410156, + "learning_rate": 7.698729582577132e-06, + "loss": 38.2693, + "step": 4106 + }, + { + "epoch": 14.827088036117381, + "grad_norm": 332.6835021972656, + "learning_rate": 7.69328493647913e-06, + "loss": 37.7161, + "step": 4107 + }, + { + "epoch": 14.830699774266366, + "grad_norm": 260.1654357910156, + "learning_rate": 7.687840290381126e-06, + "loss": 33.9704, + "step": 4108 + }, + { + "epoch": 14.83431151241535, + "grad_norm": 214.45509338378906, + "learning_rate": 7.682395644283122e-06, + "loss": 32.5126, + "step": 4109 + }, + { + "epoch": 14.837923250564334, + "grad_norm": 257.4847717285156, + "learning_rate": 7.676950998185119e-06, + "loss": 32.0682, + "step": 4110 + }, + { + "epoch": 14.837923250564334, + "eval_loss": 0.6022929549217224, + "eval_runtime": 3.1427, + "eval_samples_per_second": 56.957, + "eval_steps_per_second": 56.957, + "step": 4110 + }, + { + "epoch": 14.841534988713319, + "grad_norm": 241.302978515625, + "learning_rate": 7.671506352087114e-06, + "loss": 32.8817, + "step": 4111 + }, + { + "epoch": 14.845146726862303, + "grad_norm": 238.0950164794922, + "learning_rate": 7.66606170598911e-06, + "loss": 31.9995, + "step": 4112 + }, + { + "epoch": 14.848758465011286, + "grad_norm": 239.700439453125, + "learning_rate": 7.660617059891108e-06, + "loss": 32.9681, + "step": 4113 + }, + { + "epoch": 14.852370203160271, + "grad_norm": 234.23890686035156, + "learning_rate": 7.655172413793104e-06, + "loss": 33.6878, + "step": 4114 + }, + { + "epoch": 14.855981941309254, + "grad_norm": 367.3103332519531, + "learning_rate": 7.6497277676951e-06, + "loss": 34.2346, + "step": 4115 + }, + { + "epoch": 14.85959367945824, + "grad_norm": 221.31381225585938, + "learning_rate": 7.644283121597096e-06, + "loss": 35.0148, + "step": 4116 + }, + { + "epoch": 14.863205417607224, + "grad_norm": 352.1162109375, + "learning_rate": 7.638838475499092e-06, + "loss": 34.8326, + "step": 4117 + }, + { + "epoch": 14.866817155756207, + "grad_norm": 296.8202209472656, + "learning_rate": 7.63339382940109e-06, + "loss": 34.2522, + "step": 4118 + }, + { + "epoch": 14.870428893905192, + "grad_norm": 283.4679870605469, + "learning_rate": 7.627949183303086e-06, + "loss": 34.5005, + "step": 4119 + }, + { + "epoch": 14.874040632054175, + "grad_norm": 249.95033264160156, + "learning_rate": 7.622504537205082e-06, + "loss": 34.9581, + "step": 4120 + }, + { + "epoch": 14.874040632054175, + "eval_loss": 0.6031190752983093, + "eval_runtime": 3.1392, + "eval_samples_per_second": 57.02, + "eval_steps_per_second": 57.02, + "step": 4120 + }, + { + "epoch": 14.87765237020316, + "grad_norm": 235.65065002441406, + "learning_rate": 7.6170598911070774e-06, + "loss": 35.3024, + "step": 4121 + }, + { + "epoch": 14.881264108352145, + "grad_norm": 258.1300964355469, + "learning_rate": 7.611615245009075e-06, + "loss": 35.4444, + "step": 4122 + }, + { + "epoch": 14.884875846501128, + "grad_norm": 262.9698791503906, + "learning_rate": 7.606170598911072e-06, + "loss": 36.5643, + "step": 4123 + }, + { + "epoch": 14.888487584650113, + "grad_norm": 274.81781005859375, + "learning_rate": 7.600725952813067e-06, + "loss": 33.0157, + "step": 4124 + }, + { + "epoch": 14.892099322799098, + "grad_norm": 205.41566467285156, + "learning_rate": 7.595281306715063e-06, + "loss": 22.226, + "step": 4125 + }, + { + "epoch": 14.89571106094808, + "grad_norm": 231.19541931152344, + "learning_rate": 7.5898366606170594e-06, + "loss": 22.1499, + "step": 4126 + }, + { + "epoch": 14.899322799097066, + "grad_norm": 203.04856872558594, + "learning_rate": 7.584392014519057e-06, + "loss": 23.3987, + "step": 4127 + }, + { + "epoch": 14.90293453724605, + "grad_norm": 289.031005859375, + "learning_rate": 7.578947368421053e-06, + "loss": 24.3649, + "step": 4128 + }, + { + "epoch": 14.906546275395034, + "grad_norm": 285.2325744628906, + "learning_rate": 7.573502722323049e-06, + "loss": 41.146, + "step": 4129 + }, + { + "epoch": 14.910158013544018, + "grad_norm": 232.21603393554688, + "learning_rate": 7.568058076225045e-06, + "loss": 40.3871, + "step": 4130 + }, + { + "epoch": 14.910158013544018, + "eval_loss": 0.6056836247444153, + "eval_runtime": 3.142, + "eval_samples_per_second": 56.969, + "eval_steps_per_second": 56.969, + "step": 4130 + }, + { + "epoch": 14.913769751693001, + "grad_norm": 358.63238525390625, + "learning_rate": 7.562613430127043e-06, + "loss": 39.5914, + "step": 4131 + }, + { + "epoch": 14.917381489841986, + "grad_norm": 262.66741943359375, + "learning_rate": 7.5571687840290385e-06, + "loss": 39.4552, + "step": 4132 + }, + { + "epoch": 14.920993227990971, + "grad_norm": 228.7096710205078, + "learning_rate": 7.551724137931035e-06, + "loss": 41.5379, + "step": 4133 + }, + { + "epoch": 14.924604966139954, + "grad_norm": 266.6537780761719, + "learning_rate": 7.546279491833031e-06, + "loss": 39.8314, + "step": 4134 + }, + { + "epoch": 14.928216704288939, + "grad_norm": 329.5486755371094, + "learning_rate": 7.540834845735027e-06, + "loss": 37.8247, + "step": 4135 + }, + { + "epoch": 14.931828442437924, + "grad_norm": 391.49127197265625, + "learning_rate": 7.535390199637024e-06, + "loss": 36.8491, + "step": 4136 + }, + { + "epoch": 14.935440180586907, + "grad_norm": 342.66632080078125, + "learning_rate": 7.5299455535390205e-06, + "loss": 37.7245, + "step": 4137 + }, + { + "epoch": 14.939051918735892, + "grad_norm": 309.25115966796875, + "learning_rate": 7.524500907441017e-06, + "loss": 38.3694, + "step": 4138 + }, + { + "epoch": 14.942663656884875, + "grad_norm": 438.21539306640625, + "learning_rate": 7.519056261343012e-06, + "loss": 38.5028, + "step": 4139 + }, + { + "epoch": 14.94627539503386, + "grad_norm": 314.2667541503906, + "learning_rate": 7.513611615245008e-06, + "loss": 39.2531, + "step": 4140 + }, + { + "epoch": 14.94627539503386, + "eval_loss": 0.6075459718704224, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.069, + "eval_steps_per_second": 57.069, + "step": 4140 + }, + { + "epoch": 14.949887133182845, + "grad_norm": 348.3675537109375, + "learning_rate": 7.508166969147006e-06, + "loss": 38.3904, + "step": 4141 + }, + { + "epoch": 14.953498871331828, + "grad_norm": 448.6506652832031, + "learning_rate": 7.5027223230490025e-06, + "loss": 39.0257, + "step": 4142 + }, + { + "epoch": 14.957110609480813, + "grad_norm": 407.4074401855469, + "learning_rate": 7.497277676950998e-06, + "loss": 36.8144, + "step": 4143 + }, + { + "epoch": 14.960722347629797, + "grad_norm": 311.0707702636719, + "learning_rate": 7.491833030852995e-06, + "loss": 34.3852, + "step": 4144 + }, + { + "epoch": 14.96433408577878, + "grad_norm": 316.660400390625, + "learning_rate": 7.486388384754991e-06, + "loss": 32.9411, + "step": 4145 + }, + { + "epoch": 14.967945823927765, + "grad_norm": 405.3203125, + "learning_rate": 7.480943738656988e-06, + "loss": 32.9947, + "step": 4146 + }, + { + "epoch": 14.97155756207675, + "grad_norm": 246.47296142578125, + "learning_rate": 7.475499092558984e-06, + "loss": 34.9284, + "step": 4147 + }, + { + "epoch": 14.975169300225733, + "grad_norm": 250.6293487548828, + "learning_rate": 7.47005444646098e-06, + "loss": 33.5852, + "step": 4148 + }, + { + "epoch": 14.978781038374718, + "grad_norm": 367.8492736816406, + "learning_rate": 7.464609800362977e-06, + "loss": 34.5658, + "step": 4149 + }, + { + "epoch": 14.982392776523701, + "grad_norm": 299.1382141113281, + "learning_rate": 7.459165154264972e-06, + "loss": 35.4483, + "step": 4150 + }, + { + "epoch": 14.982392776523701, + "eval_loss": 0.6054605841636658, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4150 + }, + { + "epoch": 14.986004514672686, + "grad_norm": 448.0080261230469, + "learning_rate": 7.453720508166969e-06, + "loss": 35.9366, + "step": 4151 + }, + { + "epoch": 14.989616252821671, + "grad_norm": 496.0691223144531, + "learning_rate": 7.448275862068966e-06, + "loss": 37.6222, + "step": 4152 + }, + { + "epoch": 14.993227990970654, + "grad_norm": 300.7026062011719, + "learning_rate": 7.442831215970963e-06, + "loss": 27.5573, + "step": 4153 + }, + { + "epoch": 14.996839729119639, + "grad_norm": 183.81434631347656, + "learning_rate": 7.437386569872958e-06, + "loss": 23.0142, + "step": 4154 + }, + { + "epoch": 15.0, + "grad_norm": 198.61032104492188, + "learning_rate": 7.431941923774954e-06, + "loss": 21.0732, + "step": 4155 + }, + { + "epoch": 15.003611738148985, + "grad_norm": 244.2176513671875, + "learning_rate": 7.426497277676951e-06, + "loss": 39.1709, + "step": 4156 + }, + { + "epoch": 15.007223476297968, + "grad_norm": 211.74375915527344, + "learning_rate": 7.421052631578948e-06, + "loss": 39.9364, + "step": 4157 + }, + { + "epoch": 15.010835214446953, + "grad_norm": 216.2489013671875, + "learning_rate": 7.415607985480944e-06, + "loss": 39.5166, + "step": 4158 + }, + { + "epoch": 15.014446952595938, + "grad_norm": 279.423583984375, + "learning_rate": 7.41016333938294e-06, + "loss": 39.6738, + "step": 4159 + }, + { + "epoch": 15.01805869074492, + "grad_norm": 279.117919921875, + "learning_rate": 7.404718693284937e-06, + "loss": 39.3556, + "step": 4160 + }, + { + "epoch": 15.01805869074492, + "eval_loss": 0.6020110249519348, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 4160 + }, + { + "epoch": 15.021670428893906, + "grad_norm": 213.57162475585938, + "learning_rate": 7.399274047186933e-06, + "loss": 38.9987, + "step": 4161 + }, + { + "epoch": 15.025282167042889, + "grad_norm": 184.1968994140625, + "learning_rate": 7.393829401088929e-06, + "loss": 39.1696, + "step": 4162 + }, + { + "epoch": 15.028893905191874, + "grad_norm": 219.38076782226562, + "learning_rate": 7.388384754990926e-06, + "loss": 39.8897, + "step": 4163 + }, + { + "epoch": 15.032505643340858, + "grad_norm": 225.4325714111328, + "learning_rate": 7.382940108892922e-06, + "loss": 40.7633, + "step": 4164 + }, + { + "epoch": 15.036117381489841, + "grad_norm": 274.78472900390625, + "learning_rate": 7.377495462794918e-06, + "loss": 39.8768, + "step": 4165 + }, + { + "epoch": 15.039729119638826, + "grad_norm": 269.5557861328125, + "learning_rate": 7.3720508166969146e-06, + "loss": 38.4735, + "step": 4166 + }, + { + "epoch": 15.043340857787811, + "grad_norm": 219.78761291503906, + "learning_rate": 7.366606170598912e-06, + "loss": 37.2117, + "step": 4167 + }, + { + "epoch": 15.046952595936794, + "grad_norm": 205.49771118164062, + "learning_rate": 7.361161524500908e-06, + "loss": 36.6855, + "step": 4168 + }, + { + "epoch": 15.050564334085779, + "grad_norm": 235.72068786621094, + "learning_rate": 7.355716878402904e-06, + "loss": 35.4408, + "step": 4169 + }, + { + "epoch": 15.054176072234762, + "grad_norm": 218.84732055664062, + "learning_rate": 7.3502722323049e-06, + "loss": 38.2297, + "step": 4170 + }, + { + "epoch": 15.054176072234762, + "eval_loss": 0.6053969860076904, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 4170 + }, + { + "epoch": 15.057787810383747, + "grad_norm": 195.80685424804688, + "learning_rate": 7.3448275862068966e-06, + "loss": 35.7271, + "step": 4171 + }, + { + "epoch": 15.061399548532732, + "grad_norm": 207.12481689453125, + "learning_rate": 7.339382940108894e-06, + "loss": 37.3393, + "step": 4172 + }, + { + "epoch": 15.065011286681715, + "grad_norm": 211.0287322998047, + "learning_rate": 7.333938294010889e-06, + "loss": 36.9505, + "step": 4173 + }, + { + "epoch": 15.0686230248307, + "grad_norm": 279.0206604003906, + "learning_rate": 7.328493647912886e-06, + "loss": 38.1225, + "step": 4174 + }, + { + "epoch": 15.072234762979685, + "grad_norm": 206.3834228515625, + "learning_rate": 7.323049001814882e-06, + "loss": 37.1117, + "step": 4175 + }, + { + "epoch": 15.075846501128668, + "grad_norm": 266.8707275390625, + "learning_rate": 7.3176043557168786e-06, + "loss": 36.1971, + "step": 4176 + }, + { + "epoch": 15.079458239277653, + "grad_norm": 260.35791015625, + "learning_rate": 7.312159709618875e-06, + "loss": 37.4714, + "step": 4177 + }, + { + "epoch": 15.083069977426636, + "grad_norm": 281.152587890625, + "learning_rate": 7.306715063520871e-06, + "loss": 37.621, + "step": 4178 + }, + { + "epoch": 15.08668171557562, + "grad_norm": 246.25758361816406, + "learning_rate": 7.301270417422868e-06, + "loss": 38.919, + "step": 4179 + }, + { + "epoch": 15.090293453724605, + "grad_norm": 378.4499816894531, + "learning_rate": 7.2958257713248635e-06, + "loss": 39.5783, + "step": 4180 + }, + { + "epoch": 15.090293453724605, + "eval_loss": 0.6071392297744751, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 4180 + }, + { + "epoch": 15.093905191873588, + "grad_norm": 421.0552673339844, + "learning_rate": 7.2903811252268606e-06, + "loss": 38.9023, + "step": 4181 + }, + { + "epoch": 15.097516930022573, + "grad_norm": 264.24359130859375, + "learning_rate": 7.284936479128857e-06, + "loss": 39.6466, + "step": 4182 + }, + { + "epoch": 15.101128668171558, + "grad_norm": 246.88182067871094, + "learning_rate": 7.279491833030854e-06, + "loss": 39.4899, + "step": 4183 + }, + { + "epoch": 15.104740406320541, + "grad_norm": 236.83848571777344, + "learning_rate": 7.274047186932849e-06, + "loss": 35.6587, + "step": 4184 + }, + { + "epoch": 15.108352144469526, + "grad_norm": 278.31573486328125, + "learning_rate": 7.2686025408348455e-06, + "loss": 34.1567, + "step": 4185 + }, + { + "epoch": 15.111963882618511, + "grad_norm": 243.71160888671875, + "learning_rate": 7.2631578947368426e-06, + "loss": 32.1268, + "step": 4186 + }, + { + "epoch": 15.115575620767494, + "grad_norm": 233.81211853027344, + "learning_rate": 7.257713248638839e-06, + "loss": 31.498, + "step": 4187 + }, + { + "epoch": 15.119187358916479, + "grad_norm": 243.12672424316406, + "learning_rate": 7.252268602540835e-06, + "loss": 32.3648, + "step": 4188 + }, + { + "epoch": 15.122799097065462, + "grad_norm": 293.38299560546875, + "learning_rate": 7.246823956442831e-06, + "loss": 32.2236, + "step": 4189 + }, + { + "epoch": 15.126410835214447, + "grad_norm": 249.70071411132812, + "learning_rate": 7.241379310344828e-06, + "loss": 34.5535, + "step": 4190 + }, + { + "epoch": 15.126410835214447, + "eval_loss": 0.6050077676773071, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.934, + "eval_steps_per_second": 56.934, + "step": 4190 + }, + { + "epoch": 15.130022573363432, + "grad_norm": 300.9483642578125, + "learning_rate": 7.235934664246824e-06, + "loss": 32.9552, + "step": 4191 + }, + { + "epoch": 15.133634311512415, + "grad_norm": 228.797607421875, + "learning_rate": 7.23049001814882e-06, + "loss": 33.0974, + "step": 4192 + }, + { + "epoch": 15.1372460496614, + "grad_norm": 279.9087219238281, + "learning_rate": 7.225045372050817e-06, + "loss": 34.2865, + "step": 4193 + }, + { + "epoch": 15.140857787810384, + "grad_norm": 254.15928649902344, + "learning_rate": 7.219600725952813e-06, + "loss": 34.5603, + "step": 4194 + }, + { + "epoch": 15.144469525959368, + "grad_norm": 314.19012451171875, + "learning_rate": 7.2141560798548095e-06, + "loss": 34.6428, + "step": 4195 + }, + { + "epoch": 15.148081264108352, + "grad_norm": 291.8244323730469, + "learning_rate": 7.208711433756806e-06, + "loss": 33.6676, + "step": 4196 + }, + { + "epoch": 15.151693002257336, + "grad_norm": 276.4428405761719, + "learning_rate": 7.203266787658803e-06, + "loss": 33.9118, + "step": 4197 + }, + { + "epoch": 15.15530474040632, + "grad_norm": 265.7801208496094, + "learning_rate": 7.197822141560799e-06, + "loss": 35.1971, + "step": 4198 + }, + { + "epoch": 15.158916478555305, + "grad_norm": 244.48667907714844, + "learning_rate": 7.192377495462795e-06, + "loss": 33.0843, + "step": 4199 + }, + { + "epoch": 15.162528216704288, + "grad_norm": 348.6037902832031, + "learning_rate": 7.1869328493647915e-06, + "loss": 36.7957, + "step": 4200 + }, + { + "epoch": 15.162528216704288, + "eval_loss": 0.6052607297897339, + "eval_runtime": 3.1435, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4200 + }, + { + "epoch": 15.166139954853273, + "grad_norm": 227.31346130371094, + "learning_rate": 7.181488203266788e-06, + "loss": 28.0234, + "step": 4201 + }, + { + "epoch": 15.169751693002258, + "grad_norm": 208.75048828125, + "learning_rate": 7.176043557168784e-06, + "loss": 22.5147, + "step": 4202 + }, + { + "epoch": 15.173363431151241, + "grad_norm": 222.91090393066406, + "learning_rate": 7.17059891107078e-06, + "loss": 22.1029, + "step": 4203 + }, + { + "epoch": 15.176975169300226, + "grad_norm": 219.40621948242188, + "learning_rate": 7.165154264972777e-06, + "loss": 22.9827, + "step": 4204 + }, + { + "epoch": 15.18058690744921, + "grad_norm": 229.11813354492188, + "learning_rate": 7.1597096188747735e-06, + "loss": 23.6974, + "step": 4205 + }, + { + "epoch": 15.184198645598194, + "grad_norm": 256.7950744628906, + "learning_rate": 7.15426497277677e-06, + "loss": 39.6585, + "step": 4206 + }, + { + "epoch": 15.187810383747179, + "grad_norm": 237.47613525390625, + "learning_rate": 7.148820326678766e-06, + "loss": 40.0478, + "step": 4207 + }, + { + "epoch": 15.191422121896162, + "grad_norm": 259.54296875, + "learning_rate": 7.143375680580762e-06, + "loss": 39.7604, + "step": 4208 + }, + { + "epoch": 15.195033860045147, + "grad_norm": 249.7389678955078, + "learning_rate": 7.137931034482759e-06, + "loss": 39.0201, + "step": 4209 + }, + { + "epoch": 15.198645598194132, + "grad_norm": 298.4624938964844, + "learning_rate": 7.132486388384755e-06, + "loss": 39.8575, + "step": 4210 + }, + { + "epoch": 15.198645598194132, + "eval_loss": 0.6088115572929382, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.996, + "eval_steps_per_second": 56.996, + "step": 4210 + }, + { + "epoch": 15.202257336343115, + "grad_norm": 267.57659912109375, + "learning_rate": 7.127041742286752e-06, + "loss": 38.8929, + "step": 4211 + }, + { + "epoch": 15.2058690744921, + "grad_norm": 243.88333129882812, + "learning_rate": 7.121597096188748e-06, + "loss": 39.6078, + "step": 4212 + }, + { + "epoch": 15.209480812641084, + "grad_norm": 268.2644348144531, + "learning_rate": 7.116152450090745e-06, + "loss": 39.9488, + "step": 4213 + }, + { + "epoch": 15.213092550790067, + "grad_norm": 240.2657928466797, + "learning_rate": 7.11070780399274e-06, + "loss": 40.1645, + "step": 4214 + }, + { + "epoch": 15.216704288939052, + "grad_norm": 198.76910400390625, + "learning_rate": 7.105263157894737e-06, + "loss": 38.2229, + "step": 4215 + }, + { + "epoch": 15.220316027088035, + "grad_norm": 234.11170959472656, + "learning_rate": 7.099818511796734e-06, + "loss": 39.5294, + "step": 4216 + }, + { + "epoch": 15.22392776523702, + "grad_norm": 192.80194091796875, + "learning_rate": 7.094373865698729e-06, + "loss": 36.9752, + "step": 4217 + }, + { + "epoch": 15.227539503386005, + "grad_norm": 241.8236846923828, + "learning_rate": 7.088929219600726e-06, + "loss": 36.1043, + "step": 4218 + }, + { + "epoch": 15.231151241534988, + "grad_norm": 451.6199645996094, + "learning_rate": 7.083484573502722e-06, + "loss": 37.7911, + "step": 4219 + }, + { + "epoch": 15.234762979683973, + "grad_norm": 351.9429626464844, + "learning_rate": 7.0780399274047195e-06, + "loss": 35.5202, + "step": 4220 + }, + { + "epoch": 15.234762979683973, + "eval_loss": 0.6093130111694336, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 4220 + }, + { + "epoch": 15.238374717832958, + "grad_norm": 266.4995422363281, + "learning_rate": 7.072595281306715e-06, + "loss": 37.5552, + "step": 4221 + }, + { + "epoch": 15.241986455981941, + "grad_norm": 258.74578857421875, + "learning_rate": 7.067150635208712e-06, + "loss": 37.1315, + "step": 4222 + }, + { + "epoch": 15.245598194130926, + "grad_norm": 233.30921936035156, + "learning_rate": 7.061705989110708e-06, + "loss": 36.9237, + "step": 4223 + }, + { + "epoch": 15.249209932279909, + "grad_norm": 235.8688201904297, + "learning_rate": 7.056261343012704e-06, + "loss": 38.0112, + "step": 4224 + }, + { + "epoch": 15.252821670428894, + "grad_norm": 214.88436889648438, + "learning_rate": 7.050816696914701e-06, + "loss": 38.5641, + "step": 4225 + }, + { + "epoch": 15.256433408577879, + "grad_norm": 252.64144897460938, + "learning_rate": 7.045372050816697e-06, + "loss": 36.7125, + "step": 4226 + }, + { + "epoch": 15.260045146726862, + "grad_norm": 293.78424072265625, + "learning_rate": 7.039927404718694e-06, + "loss": 37.5956, + "step": 4227 + }, + { + "epoch": 15.263656884875846, + "grad_norm": 234.13510131835938, + "learning_rate": 7.03448275862069e-06, + "loss": 38.1829, + "step": 4228 + }, + { + "epoch": 15.267268623024831, + "grad_norm": 279.534912109375, + "learning_rate": 7.029038112522686e-06, + "loss": 39.0785, + "step": 4229 + }, + { + "epoch": 15.270880361173814, + "grad_norm": 246.4442596435547, + "learning_rate": 7.023593466424683e-06, + "loss": 39.1753, + "step": 4230 + }, + { + "epoch": 15.270880361173814, + "eval_loss": 0.6043311357498169, + "eval_runtime": 3.1452, + "eval_samples_per_second": 56.913, + "eval_steps_per_second": 56.913, + "step": 4230 + }, + { + "epoch": 15.2744920993228, + "grad_norm": 233.87466430664062, + "learning_rate": 7.018148820326679e-06, + "loss": 39.8464, + "step": 4231 + }, + { + "epoch": 15.278103837471784, + "grad_norm": 228.54898071289062, + "learning_rate": 7.012704174228675e-06, + "loss": 37.9721, + "step": 4232 + }, + { + "epoch": 15.281715575620767, + "grad_norm": 273.70050048828125, + "learning_rate": 7.007259528130671e-06, + "loss": 38.9153, + "step": 4233 + }, + { + "epoch": 15.285327313769752, + "grad_norm": 269.8402404785156, + "learning_rate": 7.001814882032668e-06, + "loss": 36.7607, + "step": 4234 + }, + { + "epoch": 15.288939051918735, + "grad_norm": 260.13629150390625, + "learning_rate": 6.996370235934665e-06, + "loss": 35.3684, + "step": 4235 + }, + { + "epoch": 15.29255079006772, + "grad_norm": 223.9878692626953, + "learning_rate": 6.990925589836661e-06, + "loss": 32.8784, + "step": 4236 + }, + { + "epoch": 15.296162528216705, + "grad_norm": 225.69212341308594, + "learning_rate": 6.985480943738657e-06, + "loss": 31.3751, + "step": 4237 + }, + { + "epoch": 15.299774266365688, + "grad_norm": 215.99801635742188, + "learning_rate": 6.980036297640653e-06, + "loss": 31.5331, + "step": 4238 + }, + { + "epoch": 15.303386004514673, + "grad_norm": 263.26568603515625, + "learning_rate": 6.97459165154265e-06, + "loss": 32.5806, + "step": 4239 + }, + { + "epoch": 15.306997742663658, + "grad_norm": 203.2392578125, + "learning_rate": 6.969147005444646e-06, + "loss": 31.6379, + "step": 4240 + }, + { + "epoch": 15.306997742663658, + "eval_loss": 0.6046441793441772, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.992, + "eval_steps_per_second": 56.992, + "step": 4240 + }, + { + "epoch": 15.31060948081264, + "grad_norm": 221.2167510986328, + "learning_rate": 6.963702359346643e-06, + "loss": 33.7034, + "step": 4241 + }, + { + "epoch": 15.314221218961626, + "grad_norm": 212.58737182617188, + "learning_rate": 6.958257713248639e-06, + "loss": 32.5511, + "step": 4242 + }, + { + "epoch": 15.317832957110609, + "grad_norm": 270.7123718261719, + "learning_rate": 6.952813067150635e-06, + "loss": 33.2513, + "step": 4243 + }, + { + "epoch": 15.321444695259594, + "grad_norm": 270.2066345214844, + "learning_rate": 6.9473684210526315e-06, + "loss": 33.9559, + "step": 4244 + }, + { + "epoch": 15.325056433408578, + "grad_norm": 232.8043212890625, + "learning_rate": 6.941923774954628e-06, + "loss": 33.9916, + "step": 4245 + }, + { + "epoch": 15.328668171557561, + "grad_norm": 325.419921875, + "learning_rate": 6.936479128856625e-06, + "loss": 35.2098, + "step": 4246 + }, + { + "epoch": 15.332279909706546, + "grad_norm": 303.326416015625, + "learning_rate": 6.93103448275862e-06, + "loss": 35.0784, + "step": 4247 + }, + { + "epoch": 15.335891647855531, + "grad_norm": 327.05963134765625, + "learning_rate": 6.925589836660617e-06, + "loss": 35.9915, + "step": 4248 + }, + { + "epoch": 15.339503386004514, + "grad_norm": 326.58795166015625, + "learning_rate": 6.9201451905626135e-06, + "loss": 35.1914, + "step": 4249 + }, + { + "epoch": 15.343115124153499, + "grad_norm": 406.38812255859375, + "learning_rate": 6.914700544464611e-06, + "loss": 37.1535, + "step": 4250 + }, + { + "epoch": 15.343115124153499, + "eval_loss": 0.6056071519851685, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 4250 + }, + { + "epoch": 15.346726862302482, + "grad_norm": 325.6965637207031, + "learning_rate": 6.909255898366606e-06, + "loss": 29.8698, + "step": 4251 + }, + { + "epoch": 15.350338600451467, + "grad_norm": 212.59727478027344, + "learning_rate": 6.903811252268603e-06, + "loss": 22.2995, + "step": 4252 + }, + { + "epoch": 15.353950338600452, + "grad_norm": 257.447509765625, + "learning_rate": 6.898366606170599e-06, + "loss": 23.1014, + "step": 4253 + }, + { + "epoch": 15.357562076749435, + "grad_norm": 266.139892578125, + "learning_rate": 6.8929219600725955e-06, + "loss": 23.2319, + "step": 4254 + }, + { + "epoch": 15.36117381489842, + "grad_norm": 332.7207336425781, + "learning_rate": 6.887477313974592e-06, + "loss": 23.7218, + "step": 4255 + }, + { + "epoch": 15.364785553047405, + "grad_norm": 272.7341003417969, + "learning_rate": 6.882032667876588e-06, + "loss": 39.5787, + "step": 4256 + }, + { + "epoch": 15.368397291196388, + "grad_norm": 259.00872802734375, + "learning_rate": 6.876588021778585e-06, + "loss": 41.0874, + "step": 4257 + }, + { + "epoch": 15.372009029345373, + "grad_norm": 236.87033081054688, + "learning_rate": 6.8711433756805804e-06, + "loss": 38.9811, + "step": 4258 + }, + { + "epoch": 15.375620767494357, + "grad_norm": 293.6808776855469, + "learning_rate": 6.8656987295825775e-06, + "loss": 39.481, + "step": 4259 + }, + { + "epoch": 15.37923250564334, + "grad_norm": 266.0845947265625, + "learning_rate": 6.860254083484574e-06, + "loss": 39.4595, + "step": 4260 + }, + { + "epoch": 15.37923250564334, + "eval_loss": 0.6039742231369019, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.047, + "eval_steps_per_second": 57.047, + "step": 4260 + }, + { + "epoch": 15.382844243792325, + "grad_norm": 398.0877685546875, + "learning_rate": 6.85480943738657e-06, + "loss": 38.8899, + "step": 4261 + }, + { + "epoch": 15.386455981941308, + "grad_norm": 208.37376403808594, + "learning_rate": 6.849364791288566e-06, + "loss": 39.2194, + "step": 4262 + }, + { + "epoch": 15.390067720090293, + "grad_norm": 214.6958770751953, + "learning_rate": 6.8439201451905624e-06, + "loss": 38.9911, + "step": 4263 + }, + { + "epoch": 15.393679458239278, + "grad_norm": 210.2147674560547, + "learning_rate": 6.8384754990925595e-06, + "loss": 40.5973, + "step": 4264 + }, + { + "epoch": 15.397291196388261, + "grad_norm": 240.47030639648438, + "learning_rate": 6.833030852994556e-06, + "loss": 39.3936, + "step": 4265 + }, + { + "epoch": 15.400902934537246, + "grad_norm": 273.86883544921875, + "learning_rate": 6.827586206896552e-06, + "loss": 40.0848, + "step": 4266 + }, + { + "epoch": 15.404514672686231, + "grad_norm": 239.36453247070312, + "learning_rate": 6.822141560798548e-06, + "loss": 36.5967, + "step": 4267 + }, + { + "epoch": 15.408126410835214, + "grad_norm": 215.3413543701172, + "learning_rate": 6.8166969147005444e-06, + "loss": 37.8173, + "step": 4268 + }, + { + "epoch": 15.411738148984199, + "grad_norm": 260.1557312011719, + "learning_rate": 6.811252268602541e-06, + "loss": 37.7175, + "step": 4269 + }, + { + "epoch": 15.415349887133182, + "grad_norm": 239.4988555908203, + "learning_rate": 6.805807622504537e-06, + "loss": 37.0618, + "step": 4270 + }, + { + "epoch": 15.415349887133182, + "eval_loss": 0.6049810647964478, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4270 + }, + { + "epoch": 15.418961625282167, + "grad_norm": 223.06094360351562, + "learning_rate": 6.800362976406534e-06, + "loss": 37.0687, + "step": 4271 + }, + { + "epoch": 15.422573363431152, + "grad_norm": 261.7460632324219, + "learning_rate": 6.79491833030853e-06, + "loss": 35.9437, + "step": 4272 + }, + { + "epoch": 15.426185101580135, + "grad_norm": 230.92135620117188, + "learning_rate": 6.7894736842105264e-06, + "loss": 38.3316, + "step": 4273 + }, + { + "epoch": 15.42979683972912, + "grad_norm": 370.6309509277344, + "learning_rate": 6.784029038112523e-06, + "loss": 38.2666, + "step": 4274 + }, + { + "epoch": 15.433408577878104, + "grad_norm": 249.7823944091797, + "learning_rate": 6.77858439201452e-06, + "loss": 38.1159, + "step": 4275 + }, + { + "epoch": 15.437020316027088, + "grad_norm": 404.1676330566406, + "learning_rate": 6.773139745916516e-06, + "loss": 37.6548, + "step": 4276 + }, + { + "epoch": 15.440632054176072, + "grad_norm": 256.3241271972656, + "learning_rate": 6.767695099818511e-06, + "loss": 38.3713, + "step": 4277 + }, + { + "epoch": 15.444243792325057, + "grad_norm": 240.55934143066406, + "learning_rate": 6.7622504537205084e-06, + "loss": 39.2487, + "step": 4278 + }, + { + "epoch": 15.44785553047404, + "grad_norm": 230.010009765625, + "learning_rate": 6.756805807622505e-06, + "loss": 39.4391, + "step": 4279 + }, + { + "epoch": 15.451467268623025, + "grad_norm": 226.51385498046875, + "learning_rate": 6.751361161524502e-06, + "loss": 38.6273, + "step": 4280 + }, + { + "epoch": 15.451467268623025, + "eval_loss": 0.6027400493621826, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 4280 + }, + { + "epoch": 15.455079006772008, + "grad_norm": 314.57476806640625, + "learning_rate": 6.745916515426497e-06, + "loss": 38.583, + "step": 4281 + }, + { + "epoch": 15.458690744920993, + "grad_norm": 229.91238403320312, + "learning_rate": 6.740471869328494e-06, + "loss": 39.2433, + "step": 4282 + }, + { + "epoch": 15.462302483069978, + "grad_norm": 284.7301330566406, + "learning_rate": 6.7350272232304904e-06, + "loss": 38.8577, + "step": 4283 + }, + { + "epoch": 15.465914221218961, + "grad_norm": 209.32266235351562, + "learning_rate": 6.729582577132486e-06, + "loss": 34.928, + "step": 4284 + }, + { + "epoch": 15.469525959367946, + "grad_norm": 264.6195068359375, + "learning_rate": 6.724137931034483e-06, + "loss": 32.0527, + "step": 4285 + }, + { + "epoch": 15.47313769751693, + "grad_norm": 224.2421112060547, + "learning_rate": 6.718693284936479e-06, + "loss": 31.939, + "step": 4286 + }, + { + "epoch": 15.476749435665914, + "grad_norm": 233.0791015625, + "learning_rate": 6.713248638838476e-06, + "loss": 32.5402, + "step": 4287 + }, + { + "epoch": 15.480361173814899, + "grad_norm": 284.129638671875, + "learning_rate": 6.707803992740472e-06, + "loss": 31.0069, + "step": 4288 + }, + { + "epoch": 15.483972911963882, + "grad_norm": 253.6517791748047, + "learning_rate": 6.702359346642469e-06, + "loss": 32.0172, + "step": 4289 + }, + { + "epoch": 15.487584650112867, + "grad_norm": 305.63775634765625, + "learning_rate": 6.696914700544465e-06, + "loss": 34.1643, + "step": 4290 + }, + { + "epoch": 15.487584650112867, + "eval_loss": 0.6044390201568604, + "eval_runtime": 3.1391, + "eval_samples_per_second": 57.023, + "eval_steps_per_second": 57.023, + "step": 4290 + }, + { + "epoch": 15.491196388261852, + "grad_norm": 224.6516876220703, + "learning_rate": 6.691470054446461e-06, + "loss": 32.4735, + "step": 4291 + }, + { + "epoch": 15.494808126410835, + "grad_norm": 257.5385437011719, + "learning_rate": 6.686025408348457e-06, + "loss": 33.9272, + "step": 4292 + }, + { + "epoch": 15.49841986455982, + "grad_norm": 393.9106140136719, + "learning_rate": 6.680580762250454e-06, + "loss": 34.4176, + "step": 4293 + }, + { + "epoch": 15.502031602708804, + "grad_norm": 333.5639953613281, + "learning_rate": 6.675136116152451e-06, + "loss": 34.5695, + "step": 4294 + }, + { + "epoch": 15.505643340857787, + "grad_norm": 319.8660888671875, + "learning_rate": 6.669691470054446e-06, + "loss": 34.5337, + "step": 4295 + }, + { + "epoch": 15.509255079006772, + "grad_norm": 246.78086853027344, + "learning_rate": 6.664246823956443e-06, + "loss": 34.8297, + "step": 4296 + }, + { + "epoch": 15.512866817155757, + "grad_norm": 313.4530944824219, + "learning_rate": 6.658802177858439e-06, + "loss": 34.6901, + "step": 4297 + }, + { + "epoch": 15.51647855530474, + "grad_norm": 257.2852783203125, + "learning_rate": 6.6533575317604364e-06, + "loss": 35.3892, + "step": 4298 + }, + { + "epoch": 15.520090293453725, + "grad_norm": 336.5549011230469, + "learning_rate": 6.647912885662432e-06, + "loss": 36.3347, + "step": 4299 + }, + { + "epoch": 15.523702031602708, + "grad_norm": 275.726806640625, + "learning_rate": 6.642468239564428e-06, + "loss": 36.3559, + "step": 4300 + }, + { + "epoch": 15.523702031602708, + "eval_loss": 0.6056334376335144, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.028, + "eval_steps_per_second": 57.028, + "step": 4300 + }, + { + "epoch": 15.527313769751693, + "grad_norm": 275.5987243652344, + "learning_rate": 6.637023593466425e-06, + "loss": 28.5887, + "step": 4301 + }, + { + "epoch": 15.530925507900678, + "grad_norm": 242.59762573242188, + "learning_rate": 6.631578947368421e-06, + "loss": 22.1398, + "step": 4302 + }, + { + "epoch": 15.534537246049661, + "grad_norm": 228.04344177246094, + "learning_rate": 6.626134301270418e-06, + "loss": 21.4593, + "step": 4303 + }, + { + "epoch": 15.538148984198646, + "grad_norm": 204.2377166748047, + "learning_rate": 6.620689655172414e-06, + "loss": 22.5132, + "step": 4304 + }, + { + "epoch": 15.54176072234763, + "grad_norm": 243.0237579345703, + "learning_rate": 6.615245009074411e-06, + "loss": 24.2777, + "step": 4305 + }, + { + "epoch": 15.545372460496614, + "grad_norm": 227.2841339111328, + "learning_rate": 6.609800362976407e-06, + "loss": 39.7235, + "step": 4306 + }, + { + "epoch": 15.548984198645599, + "grad_norm": 253.8453826904297, + "learning_rate": 6.6043557168784025e-06, + "loss": 39.9317, + "step": 4307 + }, + { + "epoch": 15.552595936794582, + "grad_norm": 243.62757873535156, + "learning_rate": 6.5989110707804e-06, + "loss": 38.9825, + "step": 4308 + }, + { + "epoch": 15.556207674943566, + "grad_norm": 262.4398498535156, + "learning_rate": 6.593466424682396e-06, + "loss": 39.7456, + "step": 4309 + }, + { + "epoch": 15.559819413092551, + "grad_norm": 268.5821228027344, + "learning_rate": 6.588021778584392e-06, + "loss": 39.5152, + "step": 4310 + }, + { + "epoch": 15.559819413092551, + "eval_loss": 0.6060237288475037, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4310 + }, + { + "epoch": 15.563431151241534, + "grad_norm": 297.6933898925781, + "learning_rate": 6.582577132486388e-06, + "loss": 40.1259, + "step": 4311 + }, + { + "epoch": 15.56704288939052, + "grad_norm": 234.08816528320312, + "learning_rate": 6.577132486388385e-06, + "loss": 40.8591, + "step": 4312 + }, + { + "epoch": 15.570654627539504, + "grad_norm": 292.2416687011719, + "learning_rate": 6.571687840290382e-06, + "loss": 39.2377, + "step": 4313 + }, + { + "epoch": 15.574266365688487, + "grad_norm": 205.25888061523438, + "learning_rate": 6.566243194192377e-06, + "loss": 39.92, + "step": 4314 + }, + { + "epoch": 15.577878103837472, + "grad_norm": 229.06695556640625, + "learning_rate": 6.560798548094374e-06, + "loss": 39.8886, + "step": 4315 + }, + { + "epoch": 15.581489841986457, + "grad_norm": 223.3977508544922, + "learning_rate": 6.55535390199637e-06, + "loss": 38.5423, + "step": 4316 + }, + { + "epoch": 15.58510158013544, + "grad_norm": 254.60203552246094, + "learning_rate": 6.549909255898367e-06, + "loss": 36.8055, + "step": 4317 + }, + { + "epoch": 15.588713318284425, + "grad_norm": 304.463623046875, + "learning_rate": 6.544464609800363e-06, + "loss": 37.6164, + "step": 4318 + }, + { + "epoch": 15.592325056433408, + "grad_norm": 279.955810546875, + "learning_rate": 6.53901996370236e-06, + "loss": 37.4778, + "step": 4319 + }, + { + "epoch": 15.595936794582393, + "grad_norm": 230.11105346679688, + "learning_rate": 6.533575317604356e-06, + "loss": 36.9663, + "step": 4320 + }, + { + "epoch": 15.595936794582393, + "eval_loss": 0.6048213243484497, + "eval_runtime": 3.1422, + "eval_samples_per_second": 56.966, + "eval_steps_per_second": 56.966, + "step": 4320 + }, + { + "epoch": 15.599548532731378, + "grad_norm": 261.98187255859375, + "learning_rate": 6.528130671506351e-06, + "loss": 37.7402, + "step": 4321 + }, + { + "epoch": 15.60316027088036, + "grad_norm": 247.34771728515625, + "learning_rate": 6.5226860254083485e-06, + "loss": 37.1402, + "step": 4322 + }, + { + "epoch": 15.606772009029346, + "grad_norm": 277.1517333984375, + "learning_rate": 6.517241379310345e-06, + "loss": 38.3976, + "step": 4323 + }, + { + "epoch": 15.610383747178329, + "grad_norm": 231.89683532714844, + "learning_rate": 6.511796733212342e-06, + "loss": 38.0834, + "step": 4324 + }, + { + "epoch": 15.613995485327314, + "grad_norm": 323.8349304199219, + "learning_rate": 6.506352087114337e-06, + "loss": 37.9085, + "step": 4325 + }, + { + "epoch": 15.617607223476298, + "grad_norm": 263.5240783691406, + "learning_rate": 6.500907441016334e-06, + "loss": 37.0702, + "step": 4326 + }, + { + "epoch": 15.621218961625281, + "grad_norm": 217.0517578125, + "learning_rate": 6.4954627949183305e-06, + "loss": 36.9406, + "step": 4327 + }, + { + "epoch": 15.624830699774266, + "grad_norm": 267.4161682128906, + "learning_rate": 6.4900181488203276e-06, + "loss": 38.8773, + "step": 4328 + }, + { + "epoch": 15.628442437923251, + "grad_norm": 232.36000061035156, + "learning_rate": 6.484573502722323e-06, + "loss": 38.4978, + "step": 4329 + }, + { + "epoch": 15.632054176072234, + "grad_norm": 241.61373901367188, + "learning_rate": 6.479128856624319e-06, + "loss": 38.4895, + "step": 4330 + }, + { + "epoch": 15.632054176072234, + "eval_loss": 0.6024956703186035, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4330 + }, + { + "epoch": 15.635665914221219, + "grad_norm": 232.27928161621094, + "learning_rate": 6.473684210526316e-06, + "loss": 38.8551, + "step": 4331 + }, + { + "epoch": 15.639277652370204, + "grad_norm": 243.42828369140625, + "learning_rate": 6.4682395644283125e-06, + "loss": 38.6475, + "step": 4332 + }, + { + "epoch": 15.642889390519187, + "grad_norm": 306.2618103027344, + "learning_rate": 6.462794918330309e-06, + "loss": 37.2015, + "step": 4333 + }, + { + "epoch": 15.646501128668172, + "grad_norm": 335.795166015625, + "learning_rate": 6.457350272232305e-06, + "loss": 36.5255, + "step": 4334 + }, + { + "epoch": 15.650112866817155, + "grad_norm": 209.6246337890625, + "learning_rate": 6.451905626134302e-06, + "loss": 32.4219, + "step": 4335 + }, + { + "epoch": 15.65372460496614, + "grad_norm": 283.2094421386719, + "learning_rate": 6.446460980036297e-06, + "loss": 30.9137, + "step": 4336 + }, + { + "epoch": 15.657336343115125, + "grad_norm": 255.4412841796875, + "learning_rate": 6.441016333938294e-06, + "loss": 30.8939, + "step": 4337 + }, + { + "epoch": 15.660948081264108, + "grad_norm": 217.8052215576172, + "learning_rate": 6.435571687840291e-06, + "loss": 31.5974, + "step": 4338 + }, + { + "epoch": 15.664559819413093, + "grad_norm": 215.64398193359375, + "learning_rate": 6.430127041742287e-06, + "loss": 30.0276, + "step": 4339 + }, + { + "epoch": 15.668171557562077, + "grad_norm": 244.32704162597656, + "learning_rate": 6.424682395644283e-06, + "loss": 32.5249, + "step": 4340 + }, + { + "epoch": 15.668171557562077, + "eval_loss": 0.6037233471870422, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4340 + }, + { + "epoch": 15.67178329571106, + "grad_norm": 270.9132080078125, + "learning_rate": 6.419237749546279e-06, + "loss": 32.9923, + "step": 4341 + }, + { + "epoch": 15.675395033860045, + "grad_norm": 230.20314025878906, + "learning_rate": 6.4137931034482765e-06, + "loss": 32.871, + "step": 4342 + }, + { + "epoch": 15.679006772009028, + "grad_norm": 372.4366149902344, + "learning_rate": 6.408348457350273e-06, + "loss": 35.2687, + "step": 4343 + }, + { + "epoch": 15.682618510158013, + "grad_norm": 325.0901794433594, + "learning_rate": 6.402903811252268e-06, + "loss": 34.3107, + "step": 4344 + }, + { + "epoch": 15.686230248306998, + "grad_norm": 277.8683166503906, + "learning_rate": 6.397459165154265e-06, + "loss": 34.291, + "step": 4345 + }, + { + "epoch": 15.689841986455981, + "grad_norm": 262.566162109375, + "learning_rate": 6.392014519056261e-06, + "loss": 33.2989, + "step": 4346 + }, + { + "epoch": 15.693453724604966, + "grad_norm": 293.56536865234375, + "learning_rate": 6.386569872958258e-06, + "loss": 35.6865, + "step": 4347 + }, + { + "epoch": 15.697065462753951, + "grad_norm": 291.1886291503906, + "learning_rate": 6.381125226860254e-06, + "loss": 35.6959, + "step": 4348 + }, + { + "epoch": 15.700677200902934, + "grad_norm": 265.2365417480469, + "learning_rate": 6.375680580762251e-06, + "loss": 36.479, + "step": 4349 + }, + { + "epoch": 15.704288939051919, + "grad_norm": 342.8822021484375, + "learning_rate": 6.370235934664247e-06, + "loss": 35.9198, + "step": 4350 + }, + { + "epoch": 15.704288939051919, + "eval_loss": 0.603361189365387, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4350 + }, + { + "epoch": 15.707900677200904, + "grad_norm": 276.1657409667969, + "learning_rate": 6.364791288566243e-06, + "loss": 29.429, + "step": 4351 + }, + { + "epoch": 15.711512415349887, + "grad_norm": 267.2456359863281, + "learning_rate": 6.35934664246824e-06, + "loss": 23.0038, + "step": 4352 + }, + { + "epoch": 15.715124153498872, + "grad_norm": 255.4893798828125, + "learning_rate": 6.353901996370236e-06, + "loss": 21.1185, + "step": 4353 + }, + { + "epoch": 15.718735891647855, + "grad_norm": 252.10501098632812, + "learning_rate": 6.348457350272233e-06, + "loss": 23.1769, + "step": 4354 + }, + { + "epoch": 15.72234762979684, + "grad_norm": 239.63905334472656, + "learning_rate": 6.343012704174228e-06, + "loss": 24.5905, + "step": 4355 + }, + { + "epoch": 15.725959367945824, + "grad_norm": 228.00950622558594, + "learning_rate": 6.337568058076225e-06, + "loss": 39.6657, + "step": 4356 + }, + { + "epoch": 15.729571106094808, + "grad_norm": 234.10647583007812, + "learning_rate": 6.332123411978222e-06, + "loss": 41.145, + "step": 4357 + }, + { + "epoch": 15.733182844243792, + "grad_norm": 236.55223083496094, + "learning_rate": 6.326678765880219e-06, + "loss": 40.2784, + "step": 4358 + }, + { + "epoch": 15.736794582392777, + "grad_norm": 340.1712646484375, + "learning_rate": 6.321234119782214e-06, + "loss": 39.3598, + "step": 4359 + }, + { + "epoch": 15.74040632054176, + "grad_norm": 269.4134826660156, + "learning_rate": 6.31578947368421e-06, + "loss": 38.7777, + "step": 4360 + }, + { + "epoch": 15.74040632054176, + "eval_loss": 0.6048015356063843, + "eval_runtime": 3.144, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4360 + }, + { + "epoch": 15.744018058690745, + "grad_norm": 316.5471496582031, + "learning_rate": 6.310344827586207e-06, + "loss": 39.6707, + "step": 4361 + }, + { + "epoch": 15.747629796839728, + "grad_norm": 231.31820678710938, + "learning_rate": 6.304900181488203e-06, + "loss": 38.0009, + "step": 4362 + }, + { + "epoch": 15.751241534988713, + "grad_norm": 207.19117736816406, + "learning_rate": 6.2994555353902e-06, + "loss": 41.6523, + "step": 4363 + }, + { + "epoch": 15.754853273137698, + "grad_norm": 239.8341064453125, + "learning_rate": 6.294010889292196e-06, + "loss": 40.3203, + "step": 4364 + }, + { + "epoch": 15.758465011286681, + "grad_norm": 277.2004089355469, + "learning_rate": 6.288566243194193e-06, + "loss": 39.8026, + "step": 4365 + }, + { + "epoch": 15.762076749435666, + "grad_norm": 227.74728393554688, + "learning_rate": 6.2831215970961886e-06, + "loss": 38.1561, + "step": 4366 + }, + { + "epoch": 15.76568848758465, + "grad_norm": 268.6826477050781, + "learning_rate": 6.277676950998185e-06, + "loss": 37.4653, + "step": 4367 + }, + { + "epoch": 15.769300225733634, + "grad_norm": 308.92950439453125, + "learning_rate": 6.272232304900182e-06, + "loss": 36.3506, + "step": 4368 + }, + { + "epoch": 15.772911963882619, + "grad_norm": 216.53627014160156, + "learning_rate": 6.266787658802178e-06, + "loss": 36.12, + "step": 4369 + }, + { + "epoch": 15.776523702031604, + "grad_norm": 264.0691833496094, + "learning_rate": 6.261343012704174e-06, + "loss": 37.5023, + "step": 4370 + }, + { + "epoch": 15.776523702031604, + "eval_loss": 0.608928382396698, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.989, + "eval_steps_per_second": 56.989, + "step": 4370 + }, + { + "epoch": 15.780135440180587, + "grad_norm": 474.7265319824219, + "learning_rate": 6.2558983666061706e-06, + "loss": 38.8381, + "step": 4371 + }, + { + "epoch": 15.783747178329572, + "grad_norm": 303.66229248046875, + "learning_rate": 6.250453720508168e-06, + "loss": 36.5951, + "step": 4372 + }, + { + "epoch": 15.787358916478555, + "grad_norm": 231.65744018554688, + "learning_rate": 6.245009074410164e-06, + "loss": 36.4717, + "step": 4373 + }, + { + "epoch": 15.79097065462754, + "grad_norm": 235.25833129882812, + "learning_rate": 6.239564428312159e-06, + "loss": 38.4578, + "step": 4374 + }, + { + "epoch": 15.794582392776524, + "grad_norm": 215.5384063720703, + "learning_rate": 6.234119782214156e-06, + "loss": 38.0475, + "step": 4375 + }, + { + "epoch": 15.798194130925507, + "grad_norm": 216.3609619140625, + "learning_rate": 6.2286751361161526e-06, + "loss": 37.1825, + "step": 4376 + }, + { + "epoch": 15.801805869074492, + "grad_norm": 275.54522705078125, + "learning_rate": 6.223230490018149e-06, + "loss": 38.5608, + "step": 4377 + }, + { + "epoch": 15.805417607223477, + "grad_norm": 226.7752685546875, + "learning_rate": 6.217785843920145e-06, + "loss": 38.0612, + "step": 4378 + }, + { + "epoch": 15.80902934537246, + "grad_norm": 262.14501953125, + "learning_rate": 6.212341197822142e-06, + "loss": 38.0049, + "step": 4379 + }, + { + "epoch": 15.812641083521445, + "grad_norm": 299.82196044921875, + "learning_rate": 6.206896551724138e-06, + "loss": 39.1441, + "step": 4380 + }, + { + "epoch": 15.812641083521445, + "eval_loss": 0.6033969521522522, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4380 + }, + { + "epoch": 15.816252821670428, + "grad_norm": 295.24188232421875, + "learning_rate": 6.2014519056261346e-06, + "loss": 39.266, + "step": 4381 + }, + { + "epoch": 15.819864559819413, + "grad_norm": 298.1729736328125, + "learning_rate": 6.196007259528131e-06, + "loss": 39.4025, + "step": 4382 + }, + { + "epoch": 15.823476297968398, + "grad_norm": 234.97958374023438, + "learning_rate": 6.190562613430127e-06, + "loss": 39.4752, + "step": 4383 + }, + { + "epoch": 15.827088036117381, + "grad_norm": 270.3009338378906, + "learning_rate": 6.185117967332124e-06, + "loss": 36.0322, + "step": 4384 + }, + { + "epoch": 15.830699774266366, + "grad_norm": 279.78314208984375, + "learning_rate": 6.1796733212341195e-06, + "loss": 33.3256, + "step": 4385 + }, + { + "epoch": 15.83431151241535, + "grad_norm": 258.82598876953125, + "learning_rate": 6.1742286751361166e-06, + "loss": 33.1552, + "step": 4386 + }, + { + "epoch": 15.837923250564334, + "grad_norm": 280.8109130859375, + "learning_rate": 6.168784029038113e-06, + "loss": 32.0024, + "step": 4387 + }, + { + "epoch": 15.841534988713319, + "grad_norm": 265.08111572265625, + "learning_rate": 6.163339382940109e-06, + "loss": 32.4901, + "step": 4388 + }, + { + "epoch": 15.845146726862303, + "grad_norm": 316.56427001953125, + "learning_rate": 6.157894736842105e-06, + "loss": 33.1995, + "step": 4389 + }, + { + "epoch": 15.848758465011286, + "grad_norm": 256.03717041015625, + "learning_rate": 6.1524500907441015e-06, + "loss": 33.1914, + "step": 4390 + }, + { + "epoch": 15.848758465011286, + "eval_loss": 0.6017575263977051, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 4390 + }, + { + "epoch": 15.852370203160271, + "grad_norm": 242.54119873046875, + "learning_rate": 6.1470054446460985e-06, + "loss": 33.8459, + "step": 4391 + }, + { + "epoch": 15.855981941309254, + "grad_norm": 259.1406555175781, + "learning_rate": 6.141560798548094e-06, + "loss": 34.1317, + "step": 4392 + }, + { + "epoch": 15.85959367945824, + "grad_norm": 272.77880859375, + "learning_rate": 6.136116152450091e-06, + "loss": 34.2777, + "step": 4393 + }, + { + "epoch": 15.863205417607224, + "grad_norm": 231.60845947265625, + "learning_rate": 6.130671506352087e-06, + "loss": 34.0165, + "step": 4394 + }, + { + "epoch": 15.866817155756207, + "grad_norm": 230.85675048828125, + "learning_rate": 6.125226860254084e-06, + "loss": 34.2761, + "step": 4395 + }, + { + "epoch": 15.870428893905192, + "grad_norm": 307.4486389160156, + "learning_rate": 6.11978221415608e-06, + "loss": 33.7407, + "step": 4396 + }, + { + "epoch": 15.874040632054175, + "grad_norm": 264.7835388183594, + "learning_rate": 6.114337568058076e-06, + "loss": 34.1672, + "step": 4397 + }, + { + "epoch": 15.87765237020316, + "grad_norm": 234.93968200683594, + "learning_rate": 6.108892921960073e-06, + "loss": 35.7158, + "step": 4398 + }, + { + "epoch": 15.881264108352145, + "grad_norm": 300.0079345703125, + "learning_rate": 6.103448275862069e-06, + "loss": 36.1292, + "step": 4399 + }, + { + "epoch": 15.884875846501128, + "grad_norm": 326.20416259765625, + "learning_rate": 6.0980036297640655e-06, + "loss": 34.8222, + "step": 4400 + }, + { + "epoch": 15.884875846501128, + "eval_loss": 0.6024067401885986, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4400 + }, + { + "epoch": 15.888487584650113, + "grad_norm": 214.6174774169922, + "learning_rate": 6.092558983666062e-06, + "loss": 27.4819, + "step": 4401 + }, + { + "epoch": 15.892099322799098, + "grad_norm": 222.7063446044922, + "learning_rate": 6.087114337568059e-06, + "loss": 22.3862, + "step": 4402 + }, + { + "epoch": 15.89571106094808, + "grad_norm": 277.0006103515625, + "learning_rate": 6.081669691470054e-06, + "loss": 22.8483, + "step": 4403 + }, + { + "epoch": 15.899322799097066, + "grad_norm": 264.3949890136719, + "learning_rate": 6.076225045372051e-06, + "loss": 23.2021, + "step": 4404 + }, + { + "epoch": 15.90293453724605, + "grad_norm": 244.04611206054688, + "learning_rate": 6.0707803992740475e-06, + "loss": 23.9378, + "step": 4405 + }, + { + "epoch": 15.906546275395034, + "grad_norm": 219.24403381347656, + "learning_rate": 6.065335753176044e-06, + "loss": 39.4708, + "step": 4406 + }, + { + "epoch": 15.910158013544018, + "grad_norm": 297.3822937011719, + "learning_rate": 6.05989110707804e-06, + "loss": 39.9151, + "step": 4407 + }, + { + "epoch": 15.913769751693001, + "grad_norm": 282.748291015625, + "learning_rate": 6.054446460980036e-06, + "loss": 39.0545, + "step": 4408 + }, + { + "epoch": 15.917381489841986, + "grad_norm": 274.6419982910156, + "learning_rate": 6.049001814882033e-06, + "loss": 39.7046, + "step": 4409 + }, + { + "epoch": 15.920993227990971, + "grad_norm": 261.2831115722656, + "learning_rate": 6.0435571687840295e-06, + "loss": 39.8849, + "step": 4410 + }, + { + "epoch": 15.920993227990971, + "eval_loss": 0.6017056107521057, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 4410 + }, + { + "epoch": 15.924604966139954, + "grad_norm": 276.61505126953125, + "learning_rate": 6.038112522686026e-06, + "loss": 39.8861, + "step": 4411 + }, + { + "epoch": 15.928216704288939, + "grad_norm": 273.4017333984375, + "learning_rate": 6.032667876588022e-06, + "loss": 36.2526, + "step": 4412 + }, + { + "epoch": 15.931828442437924, + "grad_norm": 314.4811706542969, + "learning_rate": 6.027223230490018e-06, + "loss": 37.1316, + "step": 4413 + }, + { + "epoch": 15.935440180586907, + "grad_norm": 265.7447204589844, + "learning_rate": 6.021778584392014e-06, + "loss": 38.1698, + "step": 4414 + }, + { + "epoch": 15.939051918735892, + "grad_norm": 448.373291015625, + "learning_rate": 6.016333938294011e-06, + "loss": 38.9541, + "step": 4415 + }, + { + "epoch": 15.942663656884875, + "grad_norm": 261.33966064453125, + "learning_rate": 6.010889292196008e-06, + "loss": 36.6694, + "step": 4416 + }, + { + "epoch": 15.94627539503386, + "grad_norm": 383.16363525390625, + "learning_rate": 6.005444646098004e-06, + "loss": 39.1773, + "step": 4417 + }, + { + "epoch": 15.949887133182845, + "grad_norm": 279.26446533203125, + "learning_rate": 6e-06, + "loss": 36.9482, + "step": 4418 + }, + { + "epoch": 15.953498871331828, + "grad_norm": 307.5321960449219, + "learning_rate": 5.994555353901996e-06, + "loss": 36.653, + "step": 4419 + }, + { + "epoch": 15.957110609480813, + "grad_norm": 412.80023193359375, + "learning_rate": 5.989110707803993e-06, + "loss": 36.3768, + "step": 4420 + }, + { + "epoch": 15.957110609480813, + "eval_loss": 0.6033455729484558, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4420 + }, + { + "epoch": 15.960722347629797, + "grad_norm": 254.2952880859375, + "learning_rate": 5.98366606170599e-06, + "loss": 32.546, + "step": 4421 + }, + { + "epoch": 15.96433408577878, + "grad_norm": 324.0749816894531, + "learning_rate": 5.978221415607985e-06, + "loss": 32.7021, + "step": 4422 + }, + { + "epoch": 15.967945823927765, + "grad_norm": 326.0075988769531, + "learning_rate": 5.972776769509982e-06, + "loss": 33.3823, + "step": 4423 + }, + { + "epoch": 15.97155756207675, + "grad_norm": 252.98471069335938, + "learning_rate": 5.967332123411978e-06, + "loss": 33.3397, + "step": 4424 + }, + { + "epoch": 15.975169300225733, + "grad_norm": 243.14117431640625, + "learning_rate": 5.9618874773139755e-06, + "loss": 34.2781, + "step": 4425 + }, + { + "epoch": 15.978781038374718, + "grad_norm": 304.3429260253906, + "learning_rate": 5.956442831215971e-06, + "loss": 34.1163, + "step": 4426 + }, + { + "epoch": 15.982392776523701, + "grad_norm": 320.1651916503906, + "learning_rate": 5.950998185117968e-06, + "loss": 34.1024, + "step": 4427 + }, + { + "epoch": 15.986004514672686, + "grad_norm": 252.0004425048828, + "learning_rate": 5.945553539019964e-06, + "loss": 35.8121, + "step": 4428 + }, + { + "epoch": 15.989616252821671, + "grad_norm": 342.5635986328125, + "learning_rate": 5.9401088929219595e-06, + "loss": 35.6666, + "step": 4429 + }, + { + "epoch": 15.993227990970654, + "grad_norm": 226.57249450683594, + "learning_rate": 5.934664246823957e-06, + "loss": 30.2617, + "step": 4430 + }, + { + "epoch": 15.993227990970654, + "eval_loss": 0.6029886603355408, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4430 + }, + { + "epoch": 15.996839729119639, + "grad_norm": 202.94903564453125, + "learning_rate": 5.929219600725953e-06, + "loss": 22.8166, + "step": 4431 + }, + { + "epoch": 16.0, + "grad_norm": 200.84317016601562, + "learning_rate": 5.92377495462795e-06, + "loss": 20.3903, + "step": 4432 + }, + { + "epoch": 16.003611738148983, + "grad_norm": 230.5917510986328, + "learning_rate": 5.918330308529945e-06, + "loss": 39.0985, + "step": 4433 + }, + { + "epoch": 16.00722347629797, + "grad_norm": 285.6978759765625, + "learning_rate": 5.912885662431942e-06, + "loss": 39.2128, + "step": 4434 + }, + { + "epoch": 16.010835214446953, + "grad_norm": 221.70896911621094, + "learning_rate": 5.907441016333939e-06, + "loss": 38.9026, + "step": 4435 + }, + { + "epoch": 16.014446952595936, + "grad_norm": 318.14068603515625, + "learning_rate": 5.901996370235935e-06, + "loss": 38.7336, + "step": 4436 + }, + { + "epoch": 16.018058690744923, + "grad_norm": 324.451904296875, + "learning_rate": 5.896551724137931e-06, + "loss": 38.7117, + "step": 4437 + }, + { + "epoch": 16.021670428893906, + "grad_norm": 295.038818359375, + "learning_rate": 5.891107078039927e-06, + "loss": 39.6053, + "step": 4438 + }, + { + "epoch": 16.02528216704289, + "grad_norm": 267.0055236816406, + "learning_rate": 5.885662431941924e-06, + "loss": 38.931, + "step": 4439 + }, + { + "epoch": 16.028893905191875, + "grad_norm": 269.20074462890625, + "learning_rate": 5.88021778584392e-06, + "loss": 41.1717, + "step": 4440 + }, + { + "epoch": 16.028893905191875, + "eval_loss": 0.6036069393157959, + "eval_runtime": 3.1459, + "eval_samples_per_second": 56.899, + "eval_steps_per_second": 56.899, + "step": 4440 + }, + { + "epoch": 16.03250564334086, + "grad_norm": 241.9443359375, + "learning_rate": 5.874773139745917e-06, + "loss": 38.7027, + "step": 4441 + }, + { + "epoch": 16.03611738148984, + "grad_norm": 238.54847717285156, + "learning_rate": 5.869328493647913e-06, + "loss": 39.1284, + "step": 4442 + }, + { + "epoch": 16.039729119638825, + "grad_norm": 339.3023681640625, + "learning_rate": 5.863883847549909e-06, + "loss": 38.0767, + "step": 4443 + }, + { + "epoch": 16.04334085778781, + "grad_norm": 257.29522705078125, + "learning_rate": 5.8584392014519055e-06, + "loss": 34.8207, + "step": 4444 + }, + { + "epoch": 16.046952595936794, + "grad_norm": 264.24200439453125, + "learning_rate": 5.852994555353902e-06, + "loss": 35.5021, + "step": 4445 + }, + { + "epoch": 16.050564334085777, + "grad_norm": 251.3128662109375, + "learning_rate": 5.847549909255899e-06, + "loss": 35.7826, + "step": 4446 + }, + { + "epoch": 16.054176072234764, + "grad_norm": 310.6581726074219, + "learning_rate": 5.842105263157895e-06, + "loss": 36.7373, + "step": 4447 + }, + { + "epoch": 16.057787810383747, + "grad_norm": 299.07550048828125, + "learning_rate": 5.836660617059891e-06, + "loss": 36.4048, + "step": 4448 + }, + { + "epoch": 16.06139954853273, + "grad_norm": 257.58740234375, + "learning_rate": 5.8312159709618875e-06, + "loss": 36.3982, + "step": 4449 + }, + { + "epoch": 16.065011286681717, + "grad_norm": 337.6795654296875, + "learning_rate": 5.825771324863884e-06, + "loss": 36.8518, + "step": 4450 + }, + { + "epoch": 16.065011286681717, + "eval_loss": 0.6036850214004517, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4450 + }, + { + "epoch": 16.0686230248307, + "grad_norm": 275.02423095703125, + "learning_rate": 5.820326678765881e-06, + "loss": 36.1763, + "step": 4451 + }, + { + "epoch": 16.072234762979683, + "grad_norm": 263.4334716796875, + "learning_rate": 5.814882032667876e-06, + "loss": 37.6417, + "step": 4452 + }, + { + "epoch": 16.07584650112867, + "grad_norm": 213.16749572753906, + "learning_rate": 5.809437386569873e-06, + "loss": 35.6537, + "step": 4453 + }, + { + "epoch": 16.079458239277653, + "grad_norm": 263.4288330078125, + "learning_rate": 5.8039927404718695e-06, + "loss": 36.5693, + "step": 4454 + }, + { + "epoch": 16.083069977426636, + "grad_norm": 284.67254638671875, + "learning_rate": 5.798548094373866e-06, + "loss": 37.3424, + "step": 4455 + }, + { + "epoch": 16.086681715575622, + "grad_norm": 355.7987060546875, + "learning_rate": 5.793103448275862e-06, + "loss": 38.7851, + "step": 4456 + }, + { + "epoch": 16.090293453724605, + "grad_norm": 249.7351531982422, + "learning_rate": 5.787658802177859e-06, + "loss": 38.1334, + "step": 4457 + }, + { + "epoch": 16.09390519187359, + "grad_norm": 257.4977722167969, + "learning_rate": 5.782214156079855e-06, + "loss": 37.8369, + "step": 4458 + }, + { + "epoch": 16.097516930022575, + "grad_norm": 242.59584045410156, + "learning_rate": 5.776769509981851e-06, + "loss": 37.4005, + "step": 4459 + }, + { + "epoch": 16.101128668171558, + "grad_norm": 270.0740966796875, + "learning_rate": 5.771324863883848e-06, + "loss": 38.2287, + "step": 4460 + }, + { + "epoch": 16.101128668171558, + "eval_loss": 0.6018803119659424, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 4460 + }, + { + "epoch": 16.10474040632054, + "grad_norm": 225.32322692871094, + "learning_rate": 5.765880217785844e-06, + "loss": 35.7162, + "step": 4461 + }, + { + "epoch": 16.108352144469524, + "grad_norm": 275.3272705078125, + "learning_rate": 5.760435571687841e-06, + "loss": 32.8733, + "step": 4462 + }, + { + "epoch": 16.11196388261851, + "grad_norm": 259.5124206542969, + "learning_rate": 5.7549909255898364e-06, + "loss": 33.2271, + "step": 4463 + }, + { + "epoch": 16.115575620767494, + "grad_norm": 249.75738525390625, + "learning_rate": 5.7495462794918335e-06, + "loss": 30.2931, + "step": 4464 + }, + { + "epoch": 16.119187358916477, + "grad_norm": 277.7652282714844, + "learning_rate": 5.74410163339383e-06, + "loss": 30.9294, + "step": 4465 + }, + { + "epoch": 16.122799097065464, + "grad_norm": 223.28250122070312, + "learning_rate": 5.738656987295825e-06, + "loss": 31.7337, + "step": 4466 + }, + { + "epoch": 16.126410835214447, + "grad_norm": 259.5106201171875, + "learning_rate": 5.733212341197822e-06, + "loss": 31.2897, + "step": 4467 + }, + { + "epoch": 16.13002257336343, + "grad_norm": 241.0313720703125, + "learning_rate": 5.7277676950998184e-06, + "loss": 32.8436, + "step": 4468 + }, + { + "epoch": 16.133634311512417, + "grad_norm": 277.46905517578125, + "learning_rate": 5.7223230490018155e-06, + "loss": 33.6823, + "step": 4469 + }, + { + "epoch": 16.1372460496614, + "grad_norm": 264.2905578613281, + "learning_rate": 5.716878402903811e-06, + "loss": 33.1107, + "step": 4470 + }, + { + "epoch": 16.1372460496614, + "eval_loss": 0.6046355962753296, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.015, + "eval_steps_per_second": 57.015, + "step": 4470 + }, + { + "epoch": 16.140857787810383, + "grad_norm": 295.5188903808594, + "learning_rate": 5.711433756805808e-06, + "loss": 33.6291, + "step": 4471 + }, + { + "epoch": 16.14446952595937, + "grad_norm": 282.6014709472656, + "learning_rate": 5.705989110707804e-06, + "loss": 33.0773, + "step": 4472 + }, + { + "epoch": 16.148081264108352, + "grad_norm": 270.7958679199219, + "learning_rate": 5.7005444646098004e-06, + "loss": 35.0269, + "step": 4473 + }, + { + "epoch": 16.151693002257336, + "grad_norm": 344.7304992675781, + "learning_rate": 5.695099818511797e-06, + "loss": 35.1349, + "step": 4474 + }, + { + "epoch": 16.155304740406322, + "grad_norm": 294.5618896484375, + "learning_rate": 5.689655172413793e-06, + "loss": 36.3309, + "step": 4475 + }, + { + "epoch": 16.158916478555305, + "grad_norm": 305.5354309082031, + "learning_rate": 5.68421052631579e-06, + "loss": 35.0976, + "step": 4476 + }, + { + "epoch": 16.16252821670429, + "grad_norm": 293.9934387207031, + "learning_rate": 5.678765880217786e-06, + "loss": 34.9113, + "step": 4477 + }, + { + "epoch": 16.16613995485327, + "grad_norm": 277.9523010253906, + "learning_rate": 5.6733212341197824e-06, + "loss": 24.8815, + "step": 4478 + }, + { + "epoch": 16.169751693002258, + "grad_norm": 297.0547790527344, + "learning_rate": 5.667876588021779e-06, + "loss": 22.4544, + "step": 4479 + }, + { + "epoch": 16.17336343115124, + "grad_norm": 237.44741821289062, + "learning_rate": 5.662431941923776e-06, + "loss": 21.8323, + "step": 4480 + }, + { + "epoch": 16.17336343115124, + "eval_loss": 0.6061411499977112, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.98, + "eval_steps_per_second": 56.98, + "step": 4480 + }, + { + "epoch": 16.176975169300224, + "grad_norm": 220.5832977294922, + "learning_rate": 5.656987295825771e-06, + "loss": 22.7531, + "step": 4481 + }, + { + "epoch": 16.18058690744921, + "grad_norm": 298.8033142089844, + "learning_rate": 5.651542649727767e-06, + "loss": 23.7107, + "step": 4482 + }, + { + "epoch": 16.184198645598194, + "grad_norm": 250.02593994140625, + "learning_rate": 5.6460980036297644e-06, + "loss": 39.1679, + "step": 4483 + }, + { + "epoch": 16.187810383747177, + "grad_norm": 253.00746154785156, + "learning_rate": 5.640653357531761e-06, + "loss": 40.6492, + "step": 4484 + }, + { + "epoch": 16.191422121896164, + "grad_norm": 215.04270935058594, + "learning_rate": 5.635208711433757e-06, + "loss": 38.604, + "step": 4485 + }, + { + "epoch": 16.195033860045147, + "grad_norm": 395.6152648925781, + "learning_rate": 5.629764065335753e-06, + "loss": 39.1417, + "step": 4486 + }, + { + "epoch": 16.19864559819413, + "grad_norm": 380.3653869628906, + "learning_rate": 5.62431941923775e-06, + "loss": 39.4322, + "step": 4487 + }, + { + "epoch": 16.202257336343116, + "grad_norm": 309.3524475097656, + "learning_rate": 5.6188747731397464e-06, + "loss": 39.1721, + "step": 4488 + }, + { + "epoch": 16.2058690744921, + "grad_norm": 237.88262939453125, + "learning_rate": 5.613430127041742e-06, + "loss": 39.1462, + "step": 4489 + }, + { + "epoch": 16.209480812641083, + "grad_norm": 233.66690063476562, + "learning_rate": 5.607985480943739e-06, + "loss": 39.8177, + "step": 4490 + }, + { + "epoch": 16.209480812641083, + "eval_loss": 0.6043822169303894, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4490 + }, + { + "epoch": 16.21309255079007, + "grad_norm": 229.3720703125, + "learning_rate": 5.602540834845735e-06, + "loss": 39.7878, + "step": 4491 + }, + { + "epoch": 16.216704288939052, + "grad_norm": 228.66493225097656, + "learning_rate": 5.597096188747731e-06, + "loss": 40.0754, + "step": 4492 + }, + { + "epoch": 16.220316027088035, + "grad_norm": 276.40240478515625, + "learning_rate": 5.591651542649728e-06, + "loss": 38.7709, + "step": 4493 + }, + { + "epoch": 16.223927765237022, + "grad_norm": 268.62371826171875, + "learning_rate": 5.586206896551725e-06, + "loss": 37.7439, + "step": 4494 + }, + { + "epoch": 16.227539503386005, + "grad_norm": 271.0934753417969, + "learning_rate": 5.580762250453721e-06, + "loss": 38.2511, + "step": 4495 + }, + { + "epoch": 16.231151241534988, + "grad_norm": 253.63385009765625, + "learning_rate": 5.575317604355716e-06, + "loss": 36.716, + "step": 4496 + }, + { + "epoch": 16.23476297968397, + "grad_norm": 265.1177978515625, + "learning_rate": 5.569872958257713e-06, + "loss": 36.5517, + "step": 4497 + }, + { + "epoch": 16.238374717832958, + "grad_norm": 332.52972412109375, + "learning_rate": 5.56442831215971e-06, + "loss": 37.1524, + "step": 4498 + }, + { + "epoch": 16.24198645598194, + "grad_norm": 247.53643798828125, + "learning_rate": 5.558983666061707e-06, + "loss": 36.6666, + "step": 4499 + }, + { + "epoch": 16.245598194130924, + "grad_norm": 233.3318634033203, + "learning_rate": 5.553539019963702e-06, + "loss": 37.0842, + "step": 4500 + }, + { + "epoch": 16.245598194130924, + "eval_loss": 0.6042913794517517, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.007, + "eval_steps_per_second": 57.007, + "step": 4500 + }, + { + "epoch": 16.24920993227991, + "grad_norm": 222.98350524902344, + "learning_rate": 5.548094373865699e-06, + "loss": 37.6382, + "step": 4501 + }, + { + "epoch": 16.252821670428894, + "grad_norm": 234.33267211914062, + "learning_rate": 5.542649727767695e-06, + "loss": 38.0509, + "step": 4502 + }, + { + "epoch": 16.256433408577877, + "grad_norm": 303.56005859375, + "learning_rate": 5.5372050816696924e-06, + "loss": 36.509, + "step": 4503 + }, + { + "epoch": 16.260045146726863, + "grad_norm": 232.0821075439453, + "learning_rate": 5.531760435571688e-06, + "loss": 36.3975, + "step": 4504 + }, + { + "epoch": 16.263656884875846, + "grad_norm": 223.3292236328125, + "learning_rate": 5.526315789473684e-06, + "loss": 37.0448, + "step": 4505 + }, + { + "epoch": 16.26726862302483, + "grad_norm": 241.2131805419922, + "learning_rate": 5.520871143375681e-06, + "loss": 37.8635, + "step": 4506 + }, + { + "epoch": 16.270880361173816, + "grad_norm": 288.62689208984375, + "learning_rate": 5.5154264972776765e-06, + "loss": 38.2789, + "step": 4507 + }, + { + "epoch": 16.2744920993228, + "grad_norm": 262.59637451171875, + "learning_rate": 5.5099818511796736e-06, + "loss": 37.9052, + "step": 4508 + }, + { + "epoch": 16.278103837471782, + "grad_norm": 258.0476379394531, + "learning_rate": 5.50453720508167e-06, + "loss": 38.0485, + "step": 4509 + }, + { + "epoch": 16.28171557562077, + "grad_norm": 295.2730407714844, + "learning_rate": 5.499092558983667e-06, + "loss": 37.6134, + "step": 4510 + }, + { + "epoch": 16.28171557562077, + "eval_loss": 0.601740300655365, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4510 + }, + { + "epoch": 16.285327313769752, + "grad_norm": 246.38548278808594, + "learning_rate": 5.493647912885662e-06, + "loss": 36.1289, + "step": 4511 + }, + { + "epoch": 16.288939051918735, + "grad_norm": 271.28997802734375, + "learning_rate": 5.4882032667876585e-06, + "loss": 31.8834, + "step": 4512 + }, + { + "epoch": 16.292550790067722, + "grad_norm": 231.76246643066406, + "learning_rate": 5.4827586206896556e-06, + "loss": 31.4899, + "step": 4513 + }, + { + "epoch": 16.296162528216705, + "grad_norm": 238.7414093017578, + "learning_rate": 5.477313974591652e-06, + "loss": 31.7102, + "step": 4514 + }, + { + "epoch": 16.299774266365688, + "grad_norm": 302.0710144042969, + "learning_rate": 5.471869328493648e-06, + "loss": 31.3557, + "step": 4515 + }, + { + "epoch": 16.30338600451467, + "grad_norm": 282.72015380859375, + "learning_rate": 5.466424682395644e-06, + "loss": 33.0781, + "step": 4516 + }, + { + "epoch": 16.306997742663658, + "grad_norm": 224.8140869140625, + "learning_rate": 5.460980036297641e-06, + "loss": 33.2963, + "step": 4517 + }, + { + "epoch": 16.31060948081264, + "grad_norm": 239.20570373535156, + "learning_rate": 5.4555353901996376e-06, + "loss": 34.4455, + "step": 4518 + }, + { + "epoch": 16.314221218961624, + "grad_norm": 304.7758483886719, + "learning_rate": 5.450090744101633e-06, + "loss": 34.534, + "step": 4519 + }, + { + "epoch": 16.31783295711061, + "grad_norm": 274.8758239746094, + "learning_rate": 5.44464609800363e-06, + "loss": 33.5232, + "step": 4520 + }, + { + "epoch": 16.31783295711061, + "eval_loss": 0.6031973958015442, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.071, + "eval_steps_per_second": 57.071, + "step": 4520 + }, + { + "epoch": 16.321444695259594, + "grad_norm": 295.1776428222656, + "learning_rate": 5.439201451905626e-06, + "loss": 33.403, + "step": 4521 + }, + { + "epoch": 16.325056433408577, + "grad_norm": 309.03399658203125, + "learning_rate": 5.4337568058076225e-06, + "loss": 34.1785, + "step": 4522 + }, + { + "epoch": 16.328668171557563, + "grad_norm": 285.26385498046875, + "learning_rate": 5.428312159709619e-06, + "loss": 34.4855, + "step": 4523 + }, + { + "epoch": 16.332279909706546, + "grad_norm": 307.0184020996094, + "learning_rate": 5.422867513611616e-06, + "loss": 32.4791, + "step": 4524 + }, + { + "epoch": 16.33589164785553, + "grad_norm": 318.8267822265625, + "learning_rate": 5.417422867513612e-06, + "loss": 35.697, + "step": 4525 + }, + { + "epoch": 16.339503386004516, + "grad_norm": 356.0179138183594, + "learning_rate": 5.411978221415607e-06, + "loss": 36.1811, + "step": 4526 + }, + { + "epoch": 16.3431151241535, + "grad_norm": 332.1255187988281, + "learning_rate": 5.4065335753176045e-06, + "loss": 36.2251, + "step": 4527 + }, + { + "epoch": 16.346726862302482, + "grad_norm": 288.78118896484375, + "learning_rate": 5.401088929219601e-06, + "loss": 32.0518, + "step": 4528 + }, + { + "epoch": 16.35033860045147, + "grad_norm": 250.37245178222656, + "learning_rate": 5.395644283121598e-06, + "loss": 23.627, + "step": 4529 + }, + { + "epoch": 16.353950338600452, + "grad_norm": 199.92352294921875, + "learning_rate": 5.390199637023593e-06, + "loss": 21.7919, + "step": 4530 + }, + { + "epoch": 16.353950338600452, + "eval_loss": 0.6021688580513, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 4530 + }, + { + "epoch": 16.357562076749435, + "grad_norm": 265.47015380859375, + "learning_rate": 5.38475499092559e-06, + "loss": 23.0672, + "step": 4531 + }, + { + "epoch": 16.36117381489842, + "grad_norm": 281.188720703125, + "learning_rate": 5.3793103448275865e-06, + "loss": 22.7983, + "step": 4532 + }, + { + "epoch": 16.364785553047405, + "grad_norm": 195.5351104736328, + "learning_rate": 5.373865698729583e-06, + "loss": 38.1042, + "step": 4533 + }, + { + "epoch": 16.368397291196388, + "grad_norm": 234.76573181152344, + "learning_rate": 5.368421052631579e-06, + "loss": 39.8602, + "step": 4534 + }, + { + "epoch": 16.37200902934537, + "grad_norm": 237.9152374267578, + "learning_rate": 5.362976406533575e-06, + "loss": 40.2156, + "step": 4535 + }, + { + "epoch": 16.375620767494357, + "grad_norm": 297.722900390625, + "learning_rate": 5.357531760435572e-06, + "loss": 39.3676, + "step": 4536 + }, + { + "epoch": 16.37923250564334, + "grad_norm": 218.61727905273438, + "learning_rate": 5.352087114337568e-06, + "loss": 38.7905, + "step": 4537 + }, + { + "epoch": 16.382844243792324, + "grad_norm": 245.19561767578125, + "learning_rate": 5.346642468239565e-06, + "loss": 39.3998, + "step": 4538 + }, + { + "epoch": 16.38645598194131, + "grad_norm": 247.5048370361328, + "learning_rate": 5.341197822141561e-06, + "loss": 40.0835, + "step": 4539 + }, + { + "epoch": 16.390067720090293, + "grad_norm": 214.40684509277344, + "learning_rate": 5.335753176043558e-06, + "loss": 39.1135, + "step": 4540 + }, + { + "epoch": 16.390067720090293, + "eval_loss": 0.6014460325241089, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4540 + }, + { + "epoch": 16.393679458239276, + "grad_norm": 216.72271728515625, + "learning_rate": 5.330308529945553e-06, + "loss": 38.9449, + "step": 4541 + }, + { + "epoch": 16.397291196388263, + "grad_norm": 224.22262573242188, + "learning_rate": 5.32486388384755e-06, + "loss": 39.2646, + "step": 4542 + }, + { + "epoch": 16.400902934537246, + "grad_norm": 258.6524353027344, + "learning_rate": 5.319419237749547e-06, + "loss": 38.0846, + "step": 4543 + }, + { + "epoch": 16.40451467268623, + "grad_norm": 241.7313232421875, + "learning_rate": 5.313974591651543e-06, + "loss": 37.4963, + "step": 4544 + }, + { + "epoch": 16.408126410835216, + "grad_norm": 241.3990478515625, + "learning_rate": 5.308529945553539e-06, + "loss": 36.4783, + "step": 4545 + }, + { + "epoch": 16.4117381489842, + "grad_norm": 207.1470947265625, + "learning_rate": 5.303085299455535e-06, + "loss": 36.1592, + "step": 4546 + }, + { + "epoch": 16.415349887133182, + "grad_norm": 224.51690673828125, + "learning_rate": 5.2976406533575325e-06, + "loss": 35.7946, + "step": 4547 + }, + { + "epoch": 16.41896162528217, + "grad_norm": 292.4340515136719, + "learning_rate": 5.292196007259528e-06, + "loss": 36.8986, + "step": 4548 + }, + { + "epoch": 16.42257336343115, + "grad_norm": 244.67117309570312, + "learning_rate": 5.286751361161524e-06, + "loss": 37.1165, + "step": 4549 + }, + { + "epoch": 16.426185101580135, + "grad_norm": 331.14654541015625, + "learning_rate": 5.281306715063521e-06, + "loss": 36.4423, + "step": 4550 + }, + { + "epoch": 16.426185101580135, + "eval_loss": 0.6067427396774292, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 4550 + }, + { + "epoch": 16.42979683972912, + "grad_norm": 262.373046875, + "learning_rate": 5.275862068965517e-06, + "loss": 39.0014, + "step": 4551 + }, + { + "epoch": 16.433408577878104, + "grad_norm": 237.48350524902344, + "learning_rate": 5.270417422867514e-06, + "loss": 38.0152, + "step": 4552 + }, + { + "epoch": 16.437020316027088, + "grad_norm": 273.0652770996094, + "learning_rate": 5.26497277676951e-06, + "loss": 37.6952, + "step": 4553 + }, + { + "epoch": 16.44063205417607, + "grad_norm": 239.0780029296875, + "learning_rate": 5.259528130671507e-06, + "loss": 38.4266, + "step": 4554 + }, + { + "epoch": 16.444243792325057, + "grad_norm": 277.978759765625, + "learning_rate": 5.254083484573503e-06, + "loss": 36.5596, + "step": 4555 + }, + { + "epoch": 16.44785553047404, + "grad_norm": 216.2267303466797, + "learning_rate": 5.248638838475499e-06, + "loss": 39.1408, + "step": 4556 + }, + { + "epoch": 16.451467268623023, + "grad_norm": 231.80581665039062, + "learning_rate": 5.243194192377496e-06, + "loss": 38.7286, + "step": 4557 + }, + { + "epoch": 16.45507900677201, + "grad_norm": 236.4004669189453, + "learning_rate": 5.237749546279492e-06, + "loss": 39.2426, + "step": 4558 + }, + { + "epoch": 16.458690744920993, + "grad_norm": 270.0268859863281, + "learning_rate": 5.232304900181488e-06, + "loss": 38.6546, + "step": 4559 + }, + { + "epoch": 16.462302483069976, + "grad_norm": 255.8044891357422, + "learning_rate": 5.226860254083484e-06, + "loss": 37.554, + "step": 4560 + }, + { + "epoch": 16.462302483069976, + "eval_loss": 0.6019929647445679, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4560 + }, + { + "epoch": 16.465914221218963, + "grad_norm": 321.18499755859375, + "learning_rate": 5.221415607985481e-06, + "loss": 34.9309, + "step": 4561 + }, + { + "epoch": 16.469525959367946, + "grad_norm": 311.94305419921875, + "learning_rate": 5.215970961887478e-06, + "loss": 35.8779, + "step": 4562 + }, + { + "epoch": 16.47313769751693, + "grad_norm": 211.90234375, + "learning_rate": 5.210526315789474e-06, + "loss": 31.8385, + "step": 4563 + }, + { + "epoch": 16.476749435665916, + "grad_norm": 284.64581298828125, + "learning_rate": 5.20508166969147e-06, + "loss": 31.8078, + "step": 4564 + }, + { + "epoch": 16.4803611738149, + "grad_norm": 291.94891357421875, + "learning_rate": 5.199637023593466e-06, + "loss": 33.2542, + "step": 4565 + }, + { + "epoch": 16.483972911963882, + "grad_norm": 243.61956787109375, + "learning_rate": 5.194192377495463e-06, + "loss": 31.5292, + "step": 4566 + }, + { + "epoch": 16.48758465011287, + "grad_norm": 242.07696533203125, + "learning_rate": 5.188747731397459e-06, + "loss": 33.9643, + "step": 4567 + }, + { + "epoch": 16.49119638826185, + "grad_norm": 255.0625457763672, + "learning_rate": 5.183303085299456e-06, + "loss": 33.7718, + "step": 4568 + }, + { + "epoch": 16.494808126410835, + "grad_norm": 249.40240478515625, + "learning_rate": 5.177858439201452e-06, + "loss": 31.5248, + "step": 4569 + }, + { + "epoch": 16.498419864559818, + "grad_norm": 231.3375244140625, + "learning_rate": 5.172413793103449e-06, + "loss": 34.5657, + "step": 4570 + }, + { + "epoch": 16.498419864559818, + "eval_loss": 0.6017265319824219, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.01, + "eval_steps_per_second": 57.01, + "step": 4570 + }, + { + "epoch": 16.502031602708804, + "grad_norm": 247.97012329101562, + "learning_rate": 5.1669691470054445e-06, + "loss": 33.766, + "step": 4571 + }, + { + "epoch": 16.505643340857787, + "grad_norm": 310.730224609375, + "learning_rate": 5.161524500907441e-06, + "loss": 34.0841, + "step": 4572 + }, + { + "epoch": 16.50925507900677, + "grad_norm": 323.5569152832031, + "learning_rate": 5.156079854809438e-06, + "loss": 35.0788, + "step": 4573 + }, + { + "epoch": 16.512866817155757, + "grad_norm": 247.95480346679688, + "learning_rate": 5.150635208711433e-06, + "loss": 33.5322, + "step": 4574 + }, + { + "epoch": 16.51647855530474, + "grad_norm": 307.6163024902344, + "learning_rate": 5.14519056261343e-06, + "loss": 34.4701, + "step": 4575 + }, + { + "epoch": 16.520090293453723, + "grad_norm": 239.569580078125, + "learning_rate": 5.1397459165154265e-06, + "loss": 35.8526, + "step": 4576 + }, + { + "epoch": 16.52370203160271, + "grad_norm": 362.4159240722656, + "learning_rate": 5.134301270417424e-06, + "loss": 36.2235, + "step": 4577 + }, + { + "epoch": 16.527313769751693, + "grad_norm": 321.2509765625, + "learning_rate": 5.128856624319419e-06, + "loss": 33.4705, + "step": 4578 + }, + { + "epoch": 16.530925507900676, + "grad_norm": 248.6092071533203, + "learning_rate": 5.123411978221415e-06, + "loss": 23.1329, + "step": 4579 + }, + { + "epoch": 16.534537246049663, + "grad_norm": 289.8996276855469, + "learning_rate": 5.117967332123412e-06, + "loss": 20.3184, + "step": 4580 + }, + { + "epoch": 16.534537246049663, + "eval_loss": 0.6034744381904602, + "eval_runtime": 3.1405, + "eval_samples_per_second": 56.997, + "eval_steps_per_second": 56.997, + "step": 4580 + }, + { + "epoch": 16.538148984198646, + "grad_norm": 215.02142333984375, + "learning_rate": 5.1125226860254085e-06, + "loss": 23.0513, + "step": 4581 + }, + { + "epoch": 16.54176072234763, + "grad_norm": 299.8429870605469, + "learning_rate": 5.107078039927405e-06, + "loss": 24.462, + "step": 4582 + }, + { + "epoch": 16.545372460496615, + "grad_norm": 267.0840759277344, + "learning_rate": 5.101633393829401e-06, + "loss": 39.9148, + "step": 4583 + }, + { + "epoch": 16.5489841986456, + "grad_norm": 227.23731994628906, + "learning_rate": 5.096188747731398e-06, + "loss": 40.6498, + "step": 4584 + }, + { + "epoch": 16.55259593679458, + "grad_norm": 313.9705810546875, + "learning_rate": 5.0907441016333935e-06, + "loss": 38.7711, + "step": 4585 + }, + { + "epoch": 16.55620767494357, + "grad_norm": 398.0429382324219, + "learning_rate": 5.0852994555353905e-06, + "loss": 39.6938, + "step": 4586 + }, + { + "epoch": 16.55981941309255, + "grad_norm": 365.489990234375, + "learning_rate": 5.079854809437387e-06, + "loss": 39.356, + "step": 4587 + }, + { + "epoch": 16.563431151241534, + "grad_norm": 365.05267333984375, + "learning_rate": 5.074410163339383e-06, + "loss": 40.2504, + "step": 4588 + }, + { + "epoch": 16.567042889390518, + "grad_norm": 288.0643310546875, + "learning_rate": 5.068965517241379e-06, + "loss": 39.6045, + "step": 4589 + }, + { + "epoch": 16.570654627539504, + "grad_norm": 262.0147705078125, + "learning_rate": 5.0635208711433755e-06, + "loss": 40.2504, + "step": 4590 + }, + { + "epoch": 16.570654627539504, + "eval_loss": 0.6028281450271606, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 4590 + }, + { + "epoch": 16.574266365688487, + "grad_norm": 325.78387451171875, + "learning_rate": 5.0580762250453725e-06, + "loss": 40.3154, + "step": 4591 + }, + { + "epoch": 16.57787810383747, + "grad_norm": 221.56591796875, + "learning_rate": 5.052631578947369e-06, + "loss": 39.5046, + "step": 4592 + }, + { + "epoch": 16.581489841986457, + "grad_norm": 227.02520751953125, + "learning_rate": 5.047186932849365e-06, + "loss": 38.3611, + "step": 4593 + }, + { + "epoch": 16.58510158013544, + "grad_norm": 232.46922302246094, + "learning_rate": 5.041742286751361e-06, + "loss": 36.5043, + "step": 4594 + }, + { + "epoch": 16.588713318284423, + "grad_norm": 230.59536743164062, + "learning_rate": 5.0362976406533575e-06, + "loss": 36.2179, + "step": 4595 + }, + { + "epoch": 16.59232505643341, + "grad_norm": 439.9609069824219, + "learning_rate": 5.0308529945553545e-06, + "loss": 36.4797, + "step": 4596 + }, + { + "epoch": 16.595936794582393, + "grad_norm": 322.4086608886719, + "learning_rate": 5.02540834845735e-06, + "loss": 37.4151, + "step": 4597 + }, + { + "epoch": 16.599548532731376, + "grad_norm": 318.1732482910156, + "learning_rate": 5.019963702359347e-06, + "loss": 37.2815, + "step": 4598 + }, + { + "epoch": 16.603160270880363, + "grad_norm": 321.34039306640625, + "learning_rate": 5.014519056261343e-06, + "loss": 36.8388, + "step": 4599 + }, + { + "epoch": 16.606772009029346, + "grad_norm": 341.28790283203125, + "learning_rate": 5.0090744101633395e-06, + "loss": 37.9805, + "step": 4600 + }, + { + "epoch": 16.606772009029346, + "eval_loss": 0.6045316457748413, + "eval_runtime": 3.1402, + "eval_samples_per_second": 57.002, + "eval_steps_per_second": 57.002, + "step": 4600 + }, + { + "epoch": 16.61038374717833, + "grad_norm": 259.9163513183594, + "learning_rate": 5.003629764065336e-06, + "loss": 37.5832, + "step": 4601 + }, + { + "epoch": 16.613995485327315, + "grad_norm": 297.02587890625, + "learning_rate": 4.998185117967332e-06, + "loss": 37.3808, + "step": 4602 + }, + { + "epoch": 16.6176072234763, + "grad_norm": 263.32244873046875, + "learning_rate": 4.992740471869329e-06, + "loss": 37.1047, + "step": 4603 + }, + { + "epoch": 16.62121896162528, + "grad_norm": 262.26104736328125, + "learning_rate": 4.987295825771324e-06, + "loss": 38.3592, + "step": 4604 + }, + { + "epoch": 16.624830699774268, + "grad_norm": 253.7144012451172, + "learning_rate": 4.9818511796733215e-06, + "loss": 37.4098, + "step": 4605 + }, + { + "epoch": 16.62844243792325, + "grad_norm": 279.1004943847656, + "learning_rate": 4.976406533575318e-06, + "loss": 39.3865, + "step": 4606 + }, + { + "epoch": 16.632054176072234, + "grad_norm": 298.7977600097656, + "learning_rate": 4.970961887477315e-06, + "loss": 38.6865, + "step": 4607 + }, + { + "epoch": 16.635665914221217, + "grad_norm": 256.7657470703125, + "learning_rate": 4.96551724137931e-06, + "loss": 38.7068, + "step": 4608 + }, + { + "epoch": 16.639277652370204, + "grad_norm": 238.22979736328125, + "learning_rate": 4.960072595281307e-06, + "loss": 37.749, + "step": 4609 + }, + { + "epoch": 16.642889390519187, + "grad_norm": 248.4231414794922, + "learning_rate": 4.9546279491833035e-06, + "loss": 37.582, + "step": 4610 + }, + { + "epoch": 16.642889390519187, + "eval_loss": 0.6026645302772522, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.99, + "eval_steps_per_second": 56.99, + "step": 4610 + }, + { + "epoch": 16.64650112866817, + "grad_norm": 232.70289611816406, + "learning_rate": 4.949183303085299e-06, + "loss": 34.4589, + "step": 4611 + }, + { + "epoch": 16.650112866817157, + "grad_norm": 268.4678955078125, + "learning_rate": 4.943738656987296e-06, + "loss": 32.3619, + "step": 4612 + }, + { + "epoch": 16.65372460496614, + "grad_norm": 272.07794189453125, + "learning_rate": 4.938294010889292e-06, + "loss": 32.3436, + "step": 4613 + }, + { + "epoch": 16.657336343115123, + "grad_norm": 304.4588317871094, + "learning_rate": 4.932849364791289e-06, + "loss": 30.8798, + "step": 4614 + }, + { + "epoch": 16.66094808126411, + "grad_norm": 293.3638000488281, + "learning_rate": 4.927404718693285e-06, + "loss": 31.1892, + "step": 4615 + }, + { + "epoch": 16.664559819413093, + "grad_norm": 292.844482421875, + "learning_rate": 4.921960072595282e-06, + "loss": 31.9604, + "step": 4616 + }, + { + "epoch": 16.668171557562076, + "grad_norm": 246.45339965820312, + "learning_rate": 4.916515426497278e-06, + "loss": 32.242, + "step": 4617 + }, + { + "epoch": 16.671783295711062, + "grad_norm": 269.9577941894531, + "learning_rate": 4.911070780399274e-06, + "loss": 32.5072, + "step": 4618 + }, + { + "epoch": 16.675395033860045, + "grad_norm": 312.8960876464844, + "learning_rate": 4.90562613430127e-06, + "loss": 33.8243, + "step": 4619 + }, + { + "epoch": 16.67900677200903, + "grad_norm": 287.4557189941406, + "learning_rate": 4.900181488203267e-06, + "loss": 34.3557, + "step": 4620 + }, + { + "epoch": 16.67900677200903, + "eval_loss": 0.6047338843345642, + "eval_runtime": 3.1387, + "eval_samples_per_second": 57.03, + "eval_steps_per_second": 57.03, + "step": 4620 + }, + { + "epoch": 16.682618510158015, + "grad_norm": 403.533935546875, + "learning_rate": 4.894736842105264e-06, + "loss": 34.6895, + "step": 4621 + }, + { + "epoch": 16.686230248306998, + "grad_norm": 387.5083923339844, + "learning_rate": 4.88929219600726e-06, + "loss": 34.2407, + "step": 4622 + }, + { + "epoch": 16.68984198645598, + "grad_norm": 278.8225402832031, + "learning_rate": 4.883847549909256e-06, + "loss": 33.3489, + "step": 4623 + }, + { + "epoch": 16.693453724604964, + "grad_norm": 270.46685791015625, + "learning_rate": 4.878402903811252e-06, + "loss": 34.2095, + "step": 4624 + }, + { + "epoch": 16.69706546275395, + "grad_norm": 244.6392059326172, + "learning_rate": 4.872958257713249e-06, + "loss": 35.783, + "step": 4625 + }, + { + "epoch": 16.700677200902934, + "grad_norm": 327.0617370605469, + "learning_rate": 4.867513611615245e-06, + "loss": 36.4928, + "step": 4626 + }, + { + "epoch": 16.704288939051917, + "grad_norm": 297.0531311035156, + "learning_rate": 4.862068965517241e-06, + "loss": 33.4827, + "step": 4627 + }, + { + "epoch": 16.707900677200904, + "grad_norm": 366.2174377441406, + "learning_rate": 4.856624319419238e-06, + "loss": 26.9456, + "step": 4628 + }, + { + "epoch": 16.711512415349887, + "grad_norm": 436.22613525390625, + "learning_rate": 4.851179673321234e-06, + "loss": 22.2349, + "step": 4629 + }, + { + "epoch": 16.71512415349887, + "grad_norm": 391.7647705078125, + "learning_rate": 4.845735027223231e-06, + "loss": 22.8557, + "step": 4630 + }, + { + "epoch": 16.71512415349887, + "eval_loss": 0.6052708029747009, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 4630 + }, + { + "epoch": 16.718735891647857, + "grad_norm": 277.8678283691406, + "learning_rate": 4.840290381125227e-06, + "loss": 23.3521, + "step": 4631 + }, + { + "epoch": 16.72234762979684, + "grad_norm": 252.46131896972656, + "learning_rate": 4.834845735027224e-06, + "loss": 23.7394, + "step": 4632 + }, + { + "epoch": 16.725959367945823, + "grad_norm": 214.6287078857422, + "learning_rate": 4.82940108892922e-06, + "loss": 38.6633, + "step": 4633 + }, + { + "epoch": 16.72957110609481, + "grad_norm": 257.454345703125, + "learning_rate": 4.8239564428312155e-06, + "loss": 40.5165, + "step": 4634 + }, + { + "epoch": 16.733182844243792, + "grad_norm": 211.1912841796875, + "learning_rate": 4.818511796733213e-06, + "loss": 38.483, + "step": 4635 + }, + { + "epoch": 16.736794582392776, + "grad_norm": 226.8388214111328, + "learning_rate": 4.813067150635209e-06, + "loss": 39.6143, + "step": 4636 + }, + { + "epoch": 16.740406320541762, + "grad_norm": 263.8160400390625, + "learning_rate": 4.807622504537205e-06, + "loss": 37.8442, + "step": 4637 + }, + { + "epoch": 16.744018058690745, + "grad_norm": 284.8119201660156, + "learning_rate": 4.802177858439201e-06, + "loss": 39.1835, + "step": 4638 + }, + { + "epoch": 16.74762979683973, + "grad_norm": 310.31390380859375, + "learning_rate": 4.796733212341198e-06, + "loss": 38.7035, + "step": 4639 + }, + { + "epoch": 16.751241534988715, + "grad_norm": 212.71315002441406, + "learning_rate": 4.791288566243195e-06, + "loss": 38.8803, + "step": 4640 + }, + { + "epoch": 16.751241534988715, + "eval_loss": 0.6030828952789307, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.027, + "eval_steps_per_second": 57.027, + "step": 4640 + }, + { + "epoch": 16.754853273137698, + "grad_norm": 209.7708740234375, + "learning_rate": 4.78584392014519e-06, + "loss": 39.0808, + "step": 4641 + }, + { + "epoch": 16.75846501128668, + "grad_norm": 251.971435546875, + "learning_rate": 4.780399274047187e-06, + "loss": 39.2025, + "step": 4642 + }, + { + "epoch": 16.762076749435664, + "grad_norm": 210.54151916503906, + "learning_rate": 4.774954627949183e-06, + "loss": 37.7541, + "step": 4643 + }, + { + "epoch": 16.76568848758465, + "grad_norm": 221.22119140625, + "learning_rate": 4.76950998185118e-06, + "loss": 36.4328, + "step": 4644 + }, + { + "epoch": 16.769300225733634, + "grad_norm": 201.45025634765625, + "learning_rate": 4.764065335753176e-06, + "loss": 34.9771, + "step": 4645 + }, + { + "epoch": 16.772911963882617, + "grad_norm": 241.33030700683594, + "learning_rate": 4.758620689655173e-06, + "loss": 37.6231, + "step": 4646 + }, + { + "epoch": 16.776523702031604, + "grad_norm": 282.12255859375, + "learning_rate": 4.753176043557169e-06, + "loss": 36.9822, + "step": 4647 + }, + { + "epoch": 16.780135440180587, + "grad_norm": 239.93885803222656, + "learning_rate": 4.747731397459165e-06, + "loss": 36.3529, + "step": 4648 + }, + { + "epoch": 16.78374717832957, + "grad_norm": 245.9400634765625, + "learning_rate": 4.7422867513611615e-06, + "loss": 37.518, + "step": 4649 + }, + { + "epoch": 16.787358916478556, + "grad_norm": 280.63720703125, + "learning_rate": 4.736842105263158e-06, + "loss": 37.6323, + "step": 4650 + }, + { + "epoch": 16.787358916478556, + "eval_loss": 0.6054876446723938, + "eval_runtime": 3.1439, + "eval_samples_per_second": 56.935, + "eval_steps_per_second": 56.935, + "step": 4650 + }, + { + "epoch": 16.79097065462754, + "grad_norm": 368.47698974609375, + "learning_rate": 4.731397459165155e-06, + "loss": 38.1543, + "step": 4651 + }, + { + "epoch": 16.794582392776523, + "grad_norm": 346.9169616699219, + "learning_rate": 4.72595281306715e-06, + "loss": 38.8746, + "step": 4652 + }, + { + "epoch": 16.79819413092551, + "grad_norm": 311.7519836425781, + "learning_rate": 4.720508166969147e-06, + "loss": 37.3475, + "step": 4653 + }, + { + "epoch": 16.801805869074492, + "grad_norm": 323.14910888671875, + "learning_rate": 4.7150635208711435e-06, + "loss": 38.5308, + "step": 4654 + }, + { + "epoch": 16.805417607223475, + "grad_norm": 252.71958923339844, + "learning_rate": 4.70961887477314e-06, + "loss": 38.3275, + "step": 4655 + }, + { + "epoch": 16.809029345372462, + "grad_norm": 364.2929382324219, + "learning_rate": 4.704174228675136e-06, + "loss": 38.9973, + "step": 4656 + }, + { + "epoch": 16.812641083521445, + "grad_norm": 267.23980712890625, + "learning_rate": 4.698729582577132e-06, + "loss": 38.0867, + "step": 4657 + }, + { + "epoch": 16.816252821670428, + "grad_norm": 297.4647521972656, + "learning_rate": 4.693284936479129e-06, + "loss": 38.6933, + "step": 4658 + }, + { + "epoch": 16.819864559819415, + "grad_norm": 276.2767333984375, + "learning_rate": 4.6878402903811255e-06, + "loss": 38.0279, + "step": 4659 + }, + { + "epoch": 16.823476297968398, + "grad_norm": 261.5404052734375, + "learning_rate": 4.682395644283122e-06, + "loss": 36.5149, + "step": 4660 + }, + { + "epoch": 16.823476297968398, + "eval_loss": 0.6019832491874695, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4660 + }, + { + "epoch": 16.82708803611738, + "grad_norm": 313.2170104980469, + "learning_rate": 4.676950998185118e-06, + "loss": 35.6121, + "step": 4661 + }, + { + "epoch": 16.830699774266364, + "grad_norm": 297.2791442871094, + "learning_rate": 4.671506352087115e-06, + "loss": 31.1869, + "step": 4662 + }, + { + "epoch": 16.83431151241535, + "grad_norm": 269.7320556640625, + "learning_rate": 4.666061705989111e-06, + "loss": 31.8674, + "step": 4663 + }, + { + "epoch": 16.837923250564334, + "grad_norm": 245.3898468017578, + "learning_rate": 4.660617059891107e-06, + "loss": 30.3726, + "step": 4664 + }, + { + "epoch": 16.841534988713317, + "grad_norm": 244.63223266601562, + "learning_rate": 4.655172413793104e-06, + "loss": 32.6154, + "step": 4665 + }, + { + "epoch": 16.845146726862303, + "grad_norm": 263.6791076660156, + "learning_rate": 4.6497277676951e-06, + "loss": 33.0104, + "step": 4666 + }, + { + "epoch": 16.848758465011286, + "grad_norm": 398.6610107421875, + "learning_rate": 4.644283121597096e-06, + "loss": 32.5445, + "step": 4667 + }, + { + "epoch": 16.85237020316027, + "grad_norm": 312.8116149902344, + "learning_rate": 4.6388384754990924e-06, + "loss": 32.5698, + "step": 4668 + }, + { + "epoch": 16.855981941309256, + "grad_norm": 296.6167297363281, + "learning_rate": 4.6333938294010895e-06, + "loss": 33.1377, + "step": 4669 + }, + { + "epoch": 16.85959367945824, + "grad_norm": 285.299560546875, + "learning_rate": 4.627949183303086e-06, + "loss": 33.3279, + "step": 4670 + }, + { + "epoch": 16.85959367945824, + "eval_loss": 0.6027817726135254, + "eval_runtime": 3.1412, + "eval_samples_per_second": 56.985, + "eval_steps_per_second": 56.985, + "step": 4670 + }, + { + "epoch": 16.863205417607222, + "grad_norm": 285.2948913574219, + "learning_rate": 4.622504537205081e-06, + "loss": 35.6879, + "step": 4671 + }, + { + "epoch": 16.86681715575621, + "grad_norm": 280.6530456542969, + "learning_rate": 4.617059891107078e-06, + "loss": 32.3154, + "step": 4672 + }, + { + "epoch": 16.870428893905192, + "grad_norm": 314.206787109375, + "learning_rate": 4.6116152450090744e-06, + "loss": 34.3517, + "step": 4673 + }, + { + "epoch": 16.874040632054175, + "grad_norm": 305.9198913574219, + "learning_rate": 4.6061705989110715e-06, + "loss": 34.1571, + "step": 4674 + }, + { + "epoch": 16.877652370203162, + "grad_norm": 287.0543212890625, + "learning_rate": 4.600725952813067e-06, + "loss": 35.1647, + "step": 4675 + }, + { + "epoch": 16.881264108352145, + "grad_norm": 286.912109375, + "learning_rate": 4.595281306715064e-06, + "loss": 34.8698, + "step": 4676 + }, + { + "epoch": 16.884875846501128, + "grad_norm": 322.4527587890625, + "learning_rate": 4.58983666061706e-06, + "loss": 36.3449, + "step": 4677 + }, + { + "epoch": 16.888487584650115, + "grad_norm": 239.41659545898438, + "learning_rate": 4.584392014519056e-06, + "loss": 25.3085, + "step": 4678 + }, + { + "epoch": 16.892099322799098, + "grad_norm": 215.5685577392578, + "learning_rate": 4.578947368421053e-06, + "loss": 22.3485, + "step": 4679 + }, + { + "epoch": 16.89571106094808, + "grad_norm": 291.2452697753906, + "learning_rate": 4.573502722323049e-06, + "loss": 22.3257, + "step": 4680 + }, + { + "epoch": 16.89571106094808, + "eval_loss": 0.6040940284729004, + "eval_runtime": 3.141, + "eval_samples_per_second": 56.988, + "eval_steps_per_second": 56.988, + "step": 4680 + }, + { + "epoch": 16.899322799097064, + "grad_norm": 291.39935302734375, + "learning_rate": 4.568058076225046e-06, + "loss": 23.268, + "step": 4681 + }, + { + "epoch": 16.90293453724605, + "grad_norm": 272.211181640625, + "learning_rate": 4.562613430127041e-06, + "loss": 23.7127, + "step": 4682 + }, + { + "epoch": 16.906546275395034, + "grad_norm": 220.84397888183594, + "learning_rate": 4.5571687840290384e-06, + "loss": 39.2488, + "step": 4683 + }, + { + "epoch": 16.910158013544017, + "grad_norm": 238.49859619140625, + "learning_rate": 4.551724137931035e-06, + "loss": 39.5643, + "step": 4684 + }, + { + "epoch": 16.913769751693003, + "grad_norm": 325.3870544433594, + "learning_rate": 4.546279491833032e-06, + "loss": 38.6149, + "step": 4685 + }, + { + "epoch": 16.917381489841986, + "grad_norm": 307.02349853515625, + "learning_rate": 4.540834845735027e-06, + "loss": 38.0317, + "step": 4686 + }, + { + "epoch": 16.92099322799097, + "grad_norm": 433.99359130859375, + "learning_rate": 4.535390199637023e-06, + "loss": 40.4567, + "step": 4687 + }, + { + "epoch": 16.924604966139956, + "grad_norm": 327.97015380859375, + "learning_rate": 4.5299455535390204e-06, + "loss": 40.3109, + "step": 4688 + }, + { + "epoch": 16.92821670428894, + "grad_norm": 257.20684814453125, + "learning_rate": 4.524500907441017e-06, + "loss": 36.2826, + "step": 4689 + }, + { + "epoch": 16.931828442437922, + "grad_norm": 402.6732177734375, + "learning_rate": 4.519056261343013e-06, + "loss": 36.9163, + "step": 4690 + }, + { + "epoch": 16.931828442437922, + "eval_loss": 0.6016727089881897, + "eval_runtime": 3.1434, + "eval_samples_per_second": 56.944, + "eval_steps_per_second": 56.944, + "step": 4690 + }, + { + "epoch": 16.93544018058691, + "grad_norm": 380.8903503417969, + "learning_rate": 4.513611615245009e-06, + "loss": 36.7101, + "step": 4691 + }, + { + "epoch": 16.939051918735892, + "grad_norm": 365.4950256347656, + "learning_rate": 4.508166969147006e-06, + "loss": 37.9853, + "step": 4692 + }, + { + "epoch": 16.942663656884875, + "grad_norm": 302.3895568847656, + "learning_rate": 4.5027223230490016e-06, + "loss": 38.109, + "step": 4693 + }, + { + "epoch": 16.94627539503386, + "grad_norm": 333.5274963378906, + "learning_rate": 4.497277676950998e-06, + "loss": 37.5992, + "step": 4694 + }, + { + "epoch": 16.949887133182845, + "grad_norm": 364.3126525878906, + "learning_rate": 4.491833030852995e-06, + "loss": 38.0139, + "step": 4695 + }, + { + "epoch": 16.953498871331828, + "grad_norm": 509.94671630859375, + "learning_rate": 4.486388384754991e-06, + "loss": 39.8027, + "step": 4696 + }, + { + "epoch": 16.957110609480814, + "grad_norm": 507.8591613769531, + "learning_rate": 4.480943738656987e-06, + "loss": 40.0044, + "step": 4697 + }, + { + "epoch": 16.960722347629797, + "grad_norm": 324.5463562011719, + "learning_rate": 4.4754990925589836e-06, + "loss": 34.9058, + "step": 4698 + }, + { + "epoch": 16.96433408577878, + "grad_norm": 318.39801025390625, + "learning_rate": 4.470054446460981e-06, + "loss": 33.1318, + "step": 4699 + }, + { + "epoch": 16.967945823927764, + "grad_norm": 391.8466796875, + "learning_rate": 4.464609800362977e-06, + "loss": 32.2083, + "step": 4700 + }, + { + "epoch": 16.967945823927764, + "eval_loss": 0.6047930717468262, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 4700 + }, + { + "epoch": 16.97155756207675, + "grad_norm": 530.4073486328125, + "learning_rate": 4.459165154264972e-06, + "loss": 31.9882, + "step": 4701 + }, + { + "epoch": 16.975169300225733, + "grad_norm": 590.9242553710938, + "learning_rate": 4.453720508166969e-06, + "loss": 34.1937, + "step": 4702 + }, + { + "epoch": 16.978781038374716, + "grad_norm": 377.5596618652344, + "learning_rate": 4.4482758620689656e-06, + "loss": 34.6501, + "step": 4703 + }, + { + "epoch": 16.982392776523703, + "grad_norm": 431.2909240722656, + "learning_rate": 4.442831215970962e-06, + "loss": 33.9402, + "step": 4704 + }, + { + "epoch": 16.986004514672686, + "grad_norm": 294.7673645019531, + "learning_rate": 4.437386569872958e-06, + "loss": 33.7873, + "step": 4705 + }, + { + "epoch": 16.98961625282167, + "grad_norm": 346.1203918457031, + "learning_rate": 4.431941923774955e-06, + "loss": 35.2935, + "step": 4706 + }, + { + "epoch": 16.993227990970656, + "grad_norm": 257.8351745605469, + "learning_rate": 4.426497277676951e-06, + "loss": 28.3513, + "step": 4707 + }, + { + "epoch": 16.99683972911964, + "grad_norm": 168.35118103027344, + "learning_rate": 4.421052631578947e-06, + "loss": 22.3009, + "step": 4708 + }, + { + "epoch": 17.0, + "grad_norm": 210.20738220214844, + "learning_rate": 4.415607985480944e-06, + "loss": 20.1848, + "step": 4709 + }, + { + "epoch": 17.003611738148983, + "grad_norm": 234.40866088867188, + "learning_rate": 4.41016333938294e-06, + "loss": 38.0969, + "step": 4710 + }, + { + "epoch": 17.003611738148983, + "eval_loss": 0.6026900410652161, + "eval_runtime": 3.1364, + "eval_samples_per_second": 57.072, + "eval_steps_per_second": 57.072, + "step": 4710 + }, + { + "epoch": 17.00722347629797, + "grad_norm": 242.27195739746094, + "learning_rate": 4.404718693284937e-06, + "loss": 38.8902, + "step": 4711 + }, + { + "epoch": 17.010835214446953, + "grad_norm": 215.1695556640625, + "learning_rate": 4.3992740471869325e-06, + "loss": 38.5509, + "step": 4712 + }, + { + "epoch": 17.014446952595936, + "grad_norm": 390.2027587890625, + "learning_rate": 4.3938294010889296e-06, + "loss": 38.5247, + "step": 4713 + }, + { + "epoch": 17.018058690744923, + "grad_norm": 397.77484130859375, + "learning_rate": 4.388384754990926e-06, + "loss": 39.1981, + "step": 4714 + }, + { + "epoch": 17.021670428893906, + "grad_norm": 298.10089111328125, + "learning_rate": 4.382940108892923e-06, + "loss": 38.2627, + "step": 4715 + }, + { + "epoch": 17.02528216704289, + "grad_norm": 291.7283935546875, + "learning_rate": 4.377495462794918e-06, + "loss": 38.8027, + "step": 4716 + }, + { + "epoch": 17.028893905191875, + "grad_norm": 254.8542938232422, + "learning_rate": 4.3720508166969145e-06, + "loss": 38.6095, + "step": 4717 + }, + { + "epoch": 17.03250564334086, + "grad_norm": 244.336181640625, + "learning_rate": 4.3666061705989116e-06, + "loss": 38.2955, + "step": 4718 + }, + { + "epoch": 17.03611738148984, + "grad_norm": 376.92523193359375, + "learning_rate": 4.361161524500907e-06, + "loss": 38.5203, + "step": 4719 + }, + { + "epoch": 17.039729119638825, + "grad_norm": 339.6172790527344, + "learning_rate": 4.355716878402904e-06, + "loss": 37.4332, + "step": 4720 + }, + { + "epoch": 17.039729119638825, + "eval_loss": 0.6024167537689209, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4720 + }, + { + "epoch": 17.04334085778781, + "grad_norm": 433.0855712890625, + "learning_rate": 4.3502722323049e-06, + "loss": 36.4444, + "step": 4721 + }, + { + "epoch": 17.046952595936794, + "grad_norm": 224.3468475341797, + "learning_rate": 4.344827586206897e-06, + "loss": 35.7802, + "step": 4722 + }, + { + "epoch": 17.050564334085777, + "grad_norm": 385.5466003417969, + "learning_rate": 4.339382940108893e-06, + "loss": 35.4641, + "step": 4723 + }, + { + "epoch": 17.054176072234764, + "grad_norm": 311.80596923828125, + "learning_rate": 4.333938294010889e-06, + "loss": 36.4231, + "step": 4724 + }, + { + "epoch": 17.057787810383747, + "grad_norm": 283.189453125, + "learning_rate": 4.328493647912886e-06, + "loss": 37.5405, + "step": 4725 + }, + { + "epoch": 17.06139954853273, + "grad_norm": 403.85833740234375, + "learning_rate": 4.323049001814882e-06, + "loss": 37.4723, + "step": 4726 + }, + { + "epoch": 17.065011286681717, + "grad_norm": 390.03515625, + "learning_rate": 4.3176043557168785e-06, + "loss": 36.6799, + "step": 4727 + }, + { + "epoch": 17.0686230248307, + "grad_norm": 318.63427734375, + "learning_rate": 4.312159709618875e-06, + "loss": 36.6312, + "step": 4728 + }, + { + "epoch": 17.072234762979683, + "grad_norm": 318.43402099609375, + "learning_rate": 4.306715063520872e-06, + "loss": 37.9104, + "step": 4729 + }, + { + "epoch": 17.07584650112867, + "grad_norm": 320.9336853027344, + "learning_rate": 4.301270417422867e-06, + "loss": 36.7254, + "step": 4730 + }, + { + "epoch": 17.07584650112867, + "eval_loss": 0.6046721339225769, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4730 + }, + { + "epoch": 17.079458239277653, + "grad_norm": 345.9001770019531, + "learning_rate": 4.295825771324863e-06, + "loss": 36.0298, + "step": 4731 + }, + { + "epoch": 17.083069977426636, + "grad_norm": 397.10369873046875, + "learning_rate": 4.2903811252268605e-06, + "loss": 37.9418, + "step": 4732 + }, + { + "epoch": 17.086681715575622, + "grad_norm": 293.1039123535156, + "learning_rate": 4.284936479128857e-06, + "loss": 37.2627, + "step": 4733 + }, + { + "epoch": 17.090293453724605, + "grad_norm": 412.5190734863281, + "learning_rate": 4.279491833030853e-06, + "loss": 38.3429, + "step": 4734 + }, + { + "epoch": 17.09390519187359, + "grad_norm": 241.35105895996094, + "learning_rate": 4.274047186932849e-06, + "loss": 38.559, + "step": 4735 + }, + { + "epoch": 17.097516930022575, + "grad_norm": 275.169189453125, + "learning_rate": 4.268602540834846e-06, + "loss": 36.8167, + "step": 4736 + }, + { + "epoch": 17.101128668171558, + "grad_norm": 272.3182678222656, + "learning_rate": 4.2631578947368425e-06, + "loss": 37.0246, + "step": 4737 + }, + { + "epoch": 17.10474040632054, + "grad_norm": 215.6425018310547, + "learning_rate": 4.257713248638839e-06, + "loss": 33.1282, + "step": 4738 + }, + { + "epoch": 17.108352144469524, + "grad_norm": 276.6223449707031, + "learning_rate": 4.252268602540835e-06, + "loss": 33.2698, + "step": 4739 + }, + { + "epoch": 17.11196388261851, + "grad_norm": 311.1632385253906, + "learning_rate": 4.246823956442831e-06, + "loss": 31.0105, + "step": 4740 + }, + { + "epoch": 17.11196388261851, + "eval_loss": 0.6019421815872192, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.089, + "eval_steps_per_second": 57.089, + "step": 4740 + }, + { + "epoch": 17.115575620767494, + "grad_norm": 254.7543487548828, + "learning_rate": 4.241379310344828e-06, + "loss": 31.4721, + "step": 4741 + }, + { + "epoch": 17.119187358916477, + "grad_norm": 239.24957275390625, + "learning_rate": 4.235934664246824e-06, + "loss": 31.0346, + "step": 4742 + }, + { + "epoch": 17.122799097065464, + "grad_norm": 262.0681457519531, + "learning_rate": 4.230490018148821e-06, + "loss": 32.0604, + "step": 4743 + }, + { + "epoch": 17.126410835214447, + "grad_norm": 218.3557586669922, + "learning_rate": 4.225045372050817e-06, + "loss": 32.2036, + "step": 4744 + }, + { + "epoch": 17.13002257336343, + "grad_norm": 277.5924072265625, + "learning_rate": 4.219600725952813e-06, + "loss": 32.1412, + "step": 4745 + }, + { + "epoch": 17.133634311512417, + "grad_norm": 226.93211364746094, + "learning_rate": 4.214156079854809e-06, + "loss": 34.3367, + "step": 4746 + }, + { + "epoch": 17.1372460496614, + "grad_norm": 303.2422180175781, + "learning_rate": 4.208711433756806e-06, + "loss": 33.2001, + "step": 4747 + }, + { + "epoch": 17.140857787810383, + "grad_norm": 257.6164245605469, + "learning_rate": 4.203266787658803e-06, + "loss": 34.155, + "step": 4748 + }, + { + "epoch": 17.14446952595937, + "grad_norm": 361.1567077636719, + "learning_rate": 4.197822141560798e-06, + "loss": 35.236, + "step": 4749 + }, + { + "epoch": 17.148081264108352, + "grad_norm": 292.0034484863281, + "learning_rate": 4.192377495462795e-06, + "loss": 34.304, + "step": 4750 + }, + { + "epoch": 17.148081264108352, + "eval_loss": 0.6034401059150696, + "eval_runtime": 3.1399, + "eval_samples_per_second": 57.008, + "eval_steps_per_second": 57.008, + "step": 4750 + }, + { + "epoch": 17.151693002257336, + "grad_norm": 327.8070983886719, + "learning_rate": 4.186932849364791e-06, + "loss": 33.7346, + "step": 4751 + }, + { + "epoch": 17.155304740406322, + "grad_norm": 312.9547119140625, + "learning_rate": 4.1814882032667885e-06, + "loss": 35.9274, + "step": 4752 + }, + { + "epoch": 17.158916478555305, + "grad_norm": 305.19500732421875, + "learning_rate": 4.176043557168784e-06, + "loss": 35.5567, + "step": 4753 + }, + { + "epoch": 17.16252821670429, + "grad_norm": 339.37152099609375, + "learning_rate": 4.17059891107078e-06, + "loss": 35.8013, + "step": 4754 + }, + { + "epoch": 17.16613995485327, + "grad_norm": 247.36679077148438, + "learning_rate": 4.165154264972777e-06, + "loss": 29.2211, + "step": 4755 + }, + { + "epoch": 17.169751693002258, + "grad_norm": 255.65269470214844, + "learning_rate": 4.1597096188747725e-06, + "loss": 21.6191, + "step": 4756 + }, + { + "epoch": 17.17336343115124, + "grad_norm": 239.66448974609375, + "learning_rate": 4.15426497277677e-06, + "loss": 22.0521, + "step": 4757 + }, + { + "epoch": 17.176975169300224, + "grad_norm": 212.25955200195312, + "learning_rate": 4.148820326678766e-06, + "loss": 22.6641, + "step": 4758 + }, + { + "epoch": 17.18058690744921, + "grad_norm": 229.9394073486328, + "learning_rate": 4.143375680580763e-06, + "loss": 22.8787, + "step": 4759 + }, + { + "epoch": 17.184198645598194, + "grad_norm": 237.46343994140625, + "learning_rate": 4.137931034482758e-06, + "loss": 39.1222, + "step": 4760 + }, + { + "epoch": 17.184198645598194, + "eval_loss": 0.6031526327133179, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 4760 + }, + { + "epoch": 17.187810383747177, + "grad_norm": 229.23849487304688, + "learning_rate": 4.132486388384755e-06, + "loss": 39.7664, + "step": 4761 + }, + { + "epoch": 17.191422121896164, + "grad_norm": 250.67529296875, + "learning_rate": 4.127041742286752e-06, + "loss": 38.6754, + "step": 4762 + }, + { + "epoch": 17.195033860045147, + "grad_norm": 272.9320068359375, + "learning_rate": 4.121597096188748e-06, + "loss": 39.1262, + "step": 4763 + }, + { + "epoch": 17.19864559819413, + "grad_norm": 267.82427978515625, + "learning_rate": 4.116152450090744e-06, + "loss": 38.2223, + "step": 4764 + }, + { + "epoch": 17.202257336343116, + "grad_norm": 266.35760498046875, + "learning_rate": 4.11070780399274e-06, + "loss": 39.2069, + "step": 4765 + }, + { + "epoch": 17.2058690744921, + "grad_norm": 221.62606811523438, + "learning_rate": 4.105263157894737e-06, + "loss": 38.8956, + "step": 4766 + }, + { + "epoch": 17.209480812641083, + "grad_norm": 243.73110961914062, + "learning_rate": 4.099818511796734e-06, + "loss": 41.5868, + "step": 4767 + }, + { + "epoch": 17.21309255079007, + "grad_norm": 268.6092224121094, + "learning_rate": 4.09437386569873e-06, + "loss": 39.1041, + "step": 4768 + }, + { + "epoch": 17.216704288939052, + "grad_norm": 300.3140563964844, + "learning_rate": 4.088929219600726e-06, + "loss": 38.25, + "step": 4769 + }, + { + "epoch": 17.220316027088035, + "grad_norm": 264.56805419921875, + "learning_rate": 4.083484573502722e-06, + "loss": 38.186, + "step": 4770 + }, + { + "epoch": 17.220316027088035, + "eval_loss": 0.6044566631317139, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4770 + }, + { + "epoch": 17.223927765237022, + "grad_norm": 303.47686767578125, + "learning_rate": 4.0780399274047185e-06, + "loss": 37.7011, + "step": 4771 + }, + { + "epoch": 17.227539503386005, + "grad_norm": 238.3590545654297, + "learning_rate": 4.072595281306715e-06, + "loss": 34.6695, + "step": 4772 + }, + { + "epoch": 17.231151241534988, + "grad_norm": 252.90081787109375, + "learning_rate": 4.067150635208712e-06, + "loss": 36.1903, + "step": 4773 + }, + { + "epoch": 17.23476297968397, + "grad_norm": 286.5584716796875, + "learning_rate": 4.061705989110708e-06, + "loss": 36.4185, + "step": 4774 + }, + { + "epoch": 17.238374717832958, + "grad_norm": 322.25323486328125, + "learning_rate": 4.056261343012704e-06, + "loss": 36.0098, + "step": 4775 + }, + { + "epoch": 17.24198645598194, + "grad_norm": 292.09405517578125, + "learning_rate": 4.0508166969147005e-06, + "loss": 35.4347, + "step": 4776 + }, + { + "epoch": 17.245598194130924, + "grad_norm": 295.9725341796875, + "learning_rate": 4.045372050816697e-06, + "loss": 37.3512, + "step": 4777 + }, + { + "epoch": 17.24920993227991, + "grad_norm": 326.34539794921875, + "learning_rate": 4.039927404718694e-06, + "loss": 38.6739, + "step": 4778 + }, + { + "epoch": 17.252821670428894, + "grad_norm": 384.3682861328125, + "learning_rate": 4.034482758620689e-06, + "loss": 38.0995, + "step": 4779 + }, + { + "epoch": 17.256433408577877, + "grad_norm": 400.59136962890625, + "learning_rate": 4.029038112522686e-06, + "loss": 36.7733, + "step": 4780 + }, + { + "epoch": 17.256433408577877, + "eval_loss": 0.6064656972885132, + "eval_runtime": 3.14, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 4780 + }, + { + "epoch": 17.260045146726863, + "grad_norm": 379.5261535644531, + "learning_rate": 4.0235934664246825e-06, + "loss": 36.1385, + "step": 4781 + }, + { + "epoch": 17.263656884875846, + "grad_norm": 277.1004638671875, + "learning_rate": 4.018148820326679e-06, + "loss": 39.1495, + "step": 4782 + }, + { + "epoch": 17.26726862302483, + "grad_norm": 274.6176452636719, + "learning_rate": 4.012704174228675e-06, + "loss": 37.8503, + "step": 4783 + }, + { + "epoch": 17.270880361173816, + "grad_norm": 338.9375305175781, + "learning_rate": 4.007259528130671e-06, + "loss": 39.7149, + "step": 4784 + }, + { + "epoch": 17.2744920993228, + "grad_norm": 299.60662841796875, + "learning_rate": 4.001814882032668e-06, + "loss": 37.6013, + "step": 4785 + }, + { + "epoch": 17.278103837471782, + "grad_norm": 278.9190368652344, + "learning_rate": 3.996370235934664e-06, + "loss": 38.1106, + "step": 4786 + }, + { + "epoch": 17.28171557562077, + "grad_norm": 254.48443603515625, + "learning_rate": 3.990925589836661e-06, + "loss": 35.9676, + "step": 4787 + }, + { + "epoch": 17.285327313769752, + "grad_norm": 274.65338134765625, + "learning_rate": 3.985480943738657e-06, + "loss": 35.3535, + "step": 4788 + }, + { + "epoch": 17.288939051918735, + "grad_norm": 288.748779296875, + "learning_rate": 3.980036297640654e-06, + "loss": 32.7356, + "step": 4789 + }, + { + "epoch": 17.292550790067722, + "grad_norm": 229.0682830810547, + "learning_rate": 3.9745916515426495e-06, + "loss": 31.2048, + "step": 4790 + }, + { + "epoch": 17.292550790067722, + "eval_loss": 0.6020387411117554, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.06, + "eval_steps_per_second": 57.06, + "step": 4790 + }, + { + "epoch": 17.296162528216705, + "grad_norm": 234.29937744140625, + "learning_rate": 3.9691470054446465e-06, + "loss": 31.7953, + "step": 4791 + }, + { + "epoch": 17.299774266365688, + "grad_norm": 236.3527069091797, + "learning_rate": 3.963702359346643e-06, + "loss": 31.6686, + "step": 4792 + }, + { + "epoch": 17.30338600451467, + "grad_norm": 253.44126892089844, + "learning_rate": 3.958257713248639e-06, + "loss": 31.8848, + "step": 4793 + }, + { + "epoch": 17.306997742663658, + "grad_norm": 270.66046142578125, + "learning_rate": 3.952813067150635e-06, + "loss": 32.1593, + "step": 4794 + }, + { + "epoch": 17.31060948081264, + "grad_norm": 242.77777099609375, + "learning_rate": 3.9473684210526315e-06, + "loss": 32.4555, + "step": 4795 + }, + { + "epoch": 17.314221218961624, + "grad_norm": 243.9296112060547, + "learning_rate": 3.9419237749546285e-06, + "loss": 34.0444, + "step": 4796 + }, + { + "epoch": 17.31783295711061, + "grad_norm": 276.2138671875, + "learning_rate": 3.936479128856624e-06, + "loss": 32.0404, + "step": 4797 + }, + { + "epoch": 17.321444695259594, + "grad_norm": 262.97802734375, + "learning_rate": 3.931034482758621e-06, + "loss": 32.4535, + "step": 4798 + }, + { + "epoch": 17.325056433408577, + "grad_norm": 338.9852600097656, + "learning_rate": 3.925589836660617e-06, + "loss": 34.6855, + "step": 4799 + }, + { + "epoch": 17.328668171557563, + "grad_norm": 270.85650634765625, + "learning_rate": 3.9201451905626135e-06, + "loss": 32.2425, + "step": 4800 + }, + { + "epoch": 17.328668171557563, + "eval_loss": 0.603055477142334, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 4800 + }, + { + "epoch": 17.332279909706546, + "grad_norm": 289.17584228515625, + "learning_rate": 3.91470054446461e-06, + "loss": 34.6461, + "step": 4801 + }, + { + "epoch": 17.33589164785553, + "grad_norm": 301.120361328125, + "learning_rate": 3.909255898366606e-06, + "loss": 34.5622, + "step": 4802 + }, + { + "epoch": 17.339503386004516, + "grad_norm": 328.93524169921875, + "learning_rate": 3.903811252268603e-06, + "loss": 34.9585, + "step": 4803 + }, + { + "epoch": 17.3431151241535, + "grad_norm": 445.72003173828125, + "learning_rate": 3.898366606170599e-06, + "loss": 36.9729, + "step": 4804 + }, + { + "epoch": 17.346726862302482, + "grad_norm": 249.7901153564453, + "learning_rate": 3.8929219600725955e-06, + "loss": 30.1609, + "step": 4805 + }, + { + "epoch": 17.35033860045147, + "grad_norm": 230.1756134033203, + "learning_rate": 3.887477313974592e-06, + "loss": 21.6742, + "step": 4806 + }, + { + "epoch": 17.353950338600452, + "grad_norm": 193.68104553222656, + "learning_rate": 3.882032667876588e-06, + "loss": 22.0064, + "step": 4807 + }, + { + "epoch": 17.357562076749435, + "grad_norm": 232.58486938476562, + "learning_rate": 3.876588021778585e-06, + "loss": 23.1576, + "step": 4808 + }, + { + "epoch": 17.36117381489842, + "grad_norm": 256.0340270996094, + "learning_rate": 3.87114337568058e-06, + "loss": 23.5346, + "step": 4809 + }, + { + "epoch": 17.364785553047405, + "grad_norm": 260.8665771484375, + "learning_rate": 3.8656987295825775e-06, + "loss": 39.5267, + "step": 4810 + }, + { + "epoch": 17.364785553047405, + "eval_loss": 0.6040924191474915, + "eval_runtime": 3.1444, + "eval_samples_per_second": 56.926, + "eval_steps_per_second": 56.926, + "step": 4810 + }, + { + "epoch": 17.368397291196388, + "grad_norm": 253.2076873779297, + "learning_rate": 3.860254083484574e-06, + "loss": 40.222, + "step": 4811 + }, + { + "epoch": 17.37200902934537, + "grad_norm": 232.68162536621094, + "learning_rate": 3.85480943738657e-06, + "loss": 38.8405, + "step": 4812 + }, + { + "epoch": 17.375620767494357, + "grad_norm": 264.7735290527344, + "learning_rate": 3.849364791288566e-06, + "loss": 37.8169, + "step": 4813 + }, + { + "epoch": 17.37923250564334, + "grad_norm": 305.1289978027344, + "learning_rate": 3.843920145190563e-06, + "loss": 39.4413, + "step": 4814 + }, + { + "epoch": 17.382844243792324, + "grad_norm": 409.03106689453125, + "learning_rate": 3.8384754990925594e-06, + "loss": 40.146, + "step": 4815 + }, + { + "epoch": 17.38645598194131, + "grad_norm": 307.2272644042969, + "learning_rate": 3.833030852994555e-06, + "loss": 39.0141, + "step": 4816 + }, + { + "epoch": 17.390067720090293, + "grad_norm": 272.6708068847656, + "learning_rate": 3.827586206896552e-06, + "loss": 39.4356, + "step": 4817 + }, + { + "epoch": 17.393679458239276, + "grad_norm": 239.75225830078125, + "learning_rate": 3.822141560798548e-06, + "loss": 39.1581, + "step": 4818 + }, + { + "epoch": 17.397291196388263, + "grad_norm": 203.42205810546875, + "learning_rate": 3.816696914700545e-06, + "loss": 39.9827, + "step": 4819 + }, + { + "epoch": 17.400902934537246, + "grad_norm": 217.77159118652344, + "learning_rate": 3.811252268602541e-06, + "loss": 37.5404, + "step": 4820 + }, + { + "epoch": 17.400902934537246, + "eval_loss": 0.6033807396888733, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4820 + }, + { + "epoch": 17.40451467268623, + "grad_norm": 257.9713134765625, + "learning_rate": 3.8058076225045377e-06, + "loss": 35.6571, + "step": 4821 + }, + { + "epoch": 17.408126410835216, + "grad_norm": 295.11468505859375, + "learning_rate": 3.8003629764065335e-06, + "loss": 34.7256, + "step": 4822 + }, + { + "epoch": 17.4117381489842, + "grad_norm": 248.15908813476562, + "learning_rate": 3.7949183303085297e-06, + "loss": 37.3417, + "step": 4823 + }, + { + "epoch": 17.415349887133182, + "grad_norm": 295.19085693359375, + "learning_rate": 3.7894736842105264e-06, + "loss": 37.0117, + "step": 4824 + }, + { + "epoch": 17.41896162528217, + "grad_norm": 249.31576538085938, + "learning_rate": 3.7840290381125226e-06, + "loss": 37.168, + "step": 4825 + }, + { + "epoch": 17.42257336343115, + "grad_norm": 271.1731262207031, + "learning_rate": 3.7785843920145193e-06, + "loss": 35.9932, + "step": 4826 + }, + { + "epoch": 17.426185101580135, + "grad_norm": 380.6817626953125, + "learning_rate": 3.7731397459165155e-06, + "loss": 36.952, + "step": 4827 + }, + { + "epoch": 17.42979683972912, + "grad_norm": 370.125244140625, + "learning_rate": 3.767695099818512e-06, + "loss": 38.2224, + "step": 4828 + }, + { + "epoch": 17.433408577878104, + "grad_norm": 291.13568115234375, + "learning_rate": 3.7622504537205084e-06, + "loss": 38.5377, + "step": 4829 + }, + { + "epoch": 17.437020316027088, + "grad_norm": 329.5670471191406, + "learning_rate": 3.756805807622504e-06, + "loss": 38.1665, + "step": 4830 + }, + { + "epoch": 17.437020316027088, + "eval_loss": 0.6047329902648926, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4830 + }, + { + "epoch": 17.44063205417607, + "grad_norm": 266.0620422363281, + "learning_rate": 3.7513611615245012e-06, + "loss": 34.8371, + "step": 4831 + }, + { + "epoch": 17.444243792325057, + "grad_norm": 257.48980712890625, + "learning_rate": 3.7459165154264975e-06, + "loss": 37.1885, + "step": 4832 + }, + { + "epoch": 17.44785553047404, + "grad_norm": 346.8575439453125, + "learning_rate": 3.740471869328494e-06, + "loss": 38.1426, + "step": 4833 + }, + { + "epoch": 17.451467268623023, + "grad_norm": 246.66868591308594, + "learning_rate": 3.73502722323049e-06, + "loss": 37.6658, + "step": 4834 + }, + { + "epoch": 17.45507900677201, + "grad_norm": 309.71087646484375, + "learning_rate": 3.729582577132486e-06, + "loss": 38.2335, + "step": 4835 + }, + { + "epoch": 17.458690744920993, + "grad_norm": 304.1862487792969, + "learning_rate": 3.724137931034483e-06, + "loss": 38.5964, + "step": 4836 + }, + { + "epoch": 17.462302483069976, + "grad_norm": 253.73211669921875, + "learning_rate": 3.718693284936479e-06, + "loss": 38.9237, + "step": 4837 + }, + { + "epoch": 17.465914221218963, + "grad_norm": 208.52822875976562, + "learning_rate": 3.7132486388384757e-06, + "loss": 35.9177, + "step": 4838 + }, + { + "epoch": 17.469525959367946, + "grad_norm": 258.5502014160156, + "learning_rate": 3.707803992740472e-06, + "loss": 33.2577, + "step": 4839 + }, + { + "epoch": 17.47313769751693, + "grad_norm": 269.1754150390625, + "learning_rate": 3.7023593466424686e-06, + "loss": 31.2634, + "step": 4840 + }, + { + "epoch": 17.47313769751693, + "eval_loss": 0.6035012006759644, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.062, + "eval_steps_per_second": 57.062, + "step": 4840 + }, + { + "epoch": 17.476749435665916, + "grad_norm": 268.5780029296875, + "learning_rate": 3.6969147005444644e-06, + "loss": 30.6732, + "step": 4841 + }, + { + "epoch": 17.4803611738149, + "grad_norm": 223.7191619873047, + "learning_rate": 3.691470054446461e-06, + "loss": 31.5905, + "step": 4842 + }, + { + "epoch": 17.483972911963882, + "grad_norm": 266.960205078125, + "learning_rate": 3.6860254083484573e-06, + "loss": 31.9407, + "step": 4843 + }, + { + "epoch": 17.48758465011287, + "grad_norm": 241.2608184814453, + "learning_rate": 3.680580762250454e-06, + "loss": 31.8078, + "step": 4844 + }, + { + "epoch": 17.49119638826185, + "grad_norm": 315.95166015625, + "learning_rate": 3.67513611615245e-06, + "loss": 33.5336, + "step": 4845 + }, + { + "epoch": 17.494808126410835, + "grad_norm": 277.731689453125, + "learning_rate": 3.669691470054447e-06, + "loss": 33.0484, + "step": 4846 + }, + { + "epoch": 17.498419864559818, + "grad_norm": 272.35137939453125, + "learning_rate": 3.664246823956443e-06, + "loss": 33.5048, + "step": 4847 + }, + { + "epoch": 17.502031602708804, + "grad_norm": 260.4573974609375, + "learning_rate": 3.6588021778584393e-06, + "loss": 33.5782, + "step": 4848 + }, + { + "epoch": 17.505643340857787, + "grad_norm": 285.7935485839844, + "learning_rate": 3.6533575317604355e-06, + "loss": 35.0308, + "step": 4849 + }, + { + "epoch": 17.50925507900677, + "grad_norm": 267.613037109375, + "learning_rate": 3.6479128856624317e-06, + "loss": 34.8067, + "step": 4850 + }, + { + "epoch": 17.50925507900677, + "eval_loss": 0.6035751700401306, + "eval_runtime": 3.1383, + "eval_samples_per_second": 57.037, + "eval_steps_per_second": 57.037, + "step": 4850 + }, + { + "epoch": 17.512866817155757, + "grad_norm": 301.43536376953125, + "learning_rate": 3.6424682395644284e-06, + "loss": 33.1631, + "step": 4851 + }, + { + "epoch": 17.51647855530474, + "grad_norm": 270.10467529296875, + "learning_rate": 3.6370235934664246e-06, + "loss": 32.978, + "step": 4852 + }, + { + "epoch": 17.520090293453723, + "grad_norm": 280.802001953125, + "learning_rate": 3.6315789473684213e-06, + "loss": 35.3346, + "step": 4853 + }, + { + "epoch": 17.52370203160271, + "grad_norm": 314.7720031738281, + "learning_rate": 3.6261343012704175e-06, + "loss": 33.4881, + "step": 4854 + }, + { + "epoch": 17.527313769751693, + "grad_norm": 347.4674072265625, + "learning_rate": 3.620689655172414e-06, + "loss": 31.5599, + "step": 4855 + }, + { + "epoch": 17.530925507900676, + "grad_norm": 207.3061981201172, + "learning_rate": 3.61524500907441e-06, + "loss": 22.159, + "step": 4856 + }, + { + "epoch": 17.534537246049663, + "grad_norm": 216.7202911376953, + "learning_rate": 3.6098003629764066e-06, + "loss": 21.6584, + "step": 4857 + }, + { + "epoch": 17.538148984198646, + "grad_norm": 260.20452880859375, + "learning_rate": 3.604355716878403e-06, + "loss": 22.9289, + "step": 4858 + }, + { + "epoch": 17.54176072234763, + "grad_norm": 295.9897766113281, + "learning_rate": 3.5989110707803995e-06, + "loss": 23.7172, + "step": 4859 + }, + { + "epoch": 17.545372460496615, + "grad_norm": 226.99484252929688, + "learning_rate": 3.5934664246823957e-06, + "loss": 37.5844, + "step": 4860 + }, + { + "epoch": 17.545372460496615, + "eval_loss": 0.6059216260910034, + "eval_runtime": 3.1302, + "eval_samples_per_second": 57.185, + "eval_steps_per_second": 57.185, + "step": 4860 + }, + { + "epoch": 17.5489841986456, + "grad_norm": 231.67477416992188, + "learning_rate": 3.588021778584392e-06, + "loss": 39.5191, + "step": 4861 + }, + { + "epoch": 17.55259593679458, + "grad_norm": 248.46058654785156, + "learning_rate": 3.5825771324863886e-06, + "loss": 39.4246, + "step": 4862 + }, + { + "epoch": 17.55620767494357, + "grad_norm": 239.17247009277344, + "learning_rate": 3.577132486388385e-06, + "loss": 38.9811, + "step": 4863 + }, + { + "epoch": 17.55981941309255, + "grad_norm": 325.3457946777344, + "learning_rate": 3.571687840290381e-06, + "loss": 38.4724, + "step": 4864 + }, + { + "epoch": 17.563431151241534, + "grad_norm": 264.5011901855469, + "learning_rate": 3.5662431941923773e-06, + "loss": 38.79, + "step": 4865 + }, + { + "epoch": 17.567042889390518, + "grad_norm": 251.97154235839844, + "learning_rate": 3.560798548094374e-06, + "loss": 38.0342, + "step": 4866 + }, + { + "epoch": 17.570654627539504, + "grad_norm": 236.78271484375, + "learning_rate": 3.55535390199637e-06, + "loss": 39.8586, + "step": 4867 + }, + { + "epoch": 17.574266365688487, + "grad_norm": 276.8800048828125, + "learning_rate": 3.549909255898367e-06, + "loss": 37.8967, + "step": 4868 + }, + { + "epoch": 17.57787810383747, + "grad_norm": 255.9346160888672, + "learning_rate": 3.544464609800363e-06, + "loss": 39.9833, + "step": 4869 + }, + { + "epoch": 17.581489841986457, + "grad_norm": 273.71337890625, + "learning_rate": 3.5390199637023597e-06, + "loss": 38.6235, + "step": 4870 + }, + { + "epoch": 17.581489841986457, + "eval_loss": 0.6033145189285278, + "eval_runtime": 3.1252, + "eval_samples_per_second": 57.275, + "eval_steps_per_second": 57.275, + "step": 4870 + }, + { + "epoch": 17.58510158013544, + "grad_norm": 252.93063354492188, + "learning_rate": 3.533575317604356e-06, + "loss": 37.9017, + "step": 4871 + }, + { + "epoch": 17.588713318284423, + "grad_norm": 259.8314208984375, + "learning_rate": 3.528130671506352e-06, + "loss": 34.6046, + "step": 4872 + }, + { + "epoch": 17.59232505643341, + "grad_norm": 230.2709197998047, + "learning_rate": 3.5226860254083484e-06, + "loss": 35.301, + "step": 4873 + }, + { + "epoch": 17.595936794582393, + "grad_norm": 306.6289367675781, + "learning_rate": 3.517241379310345e-06, + "loss": 37.4443, + "step": 4874 + }, + { + "epoch": 17.599548532731376, + "grad_norm": 241.5065460205078, + "learning_rate": 3.5117967332123413e-06, + "loss": 36.3646, + "step": 4875 + }, + { + "epoch": 17.603160270880363, + "grad_norm": 234.2492218017578, + "learning_rate": 3.5063520871143375e-06, + "loss": 36.2621, + "step": 4876 + }, + { + "epoch": 17.606772009029346, + "grad_norm": 256.5443115234375, + "learning_rate": 3.500907441016334e-06, + "loss": 36.2202, + "step": 4877 + }, + { + "epoch": 17.61038374717833, + "grad_norm": 280.31097412109375, + "learning_rate": 3.4954627949183304e-06, + "loss": 37.5031, + "step": 4878 + }, + { + "epoch": 17.613995485327315, + "grad_norm": 304.2773132324219, + "learning_rate": 3.4900181488203267e-06, + "loss": 37.1418, + "step": 4879 + }, + { + "epoch": 17.6176072234763, + "grad_norm": 361.27716064453125, + "learning_rate": 3.484573502722323e-06, + "loss": 37.1474, + "step": 4880 + }, + { + "epoch": 17.6176072234763, + "eval_loss": 0.6052342653274536, + "eval_runtime": 3.1249, + "eval_samples_per_second": 57.282, + "eval_steps_per_second": 57.282, + "step": 4880 + }, + { + "epoch": 17.62121896162528, + "grad_norm": 237.64540100097656, + "learning_rate": 3.4791288566243195e-06, + "loss": 38.0673, + "step": 4881 + }, + { + "epoch": 17.624830699774268, + "grad_norm": 351.27215576171875, + "learning_rate": 3.4736842105263158e-06, + "loss": 38.8272, + "step": 4882 + }, + { + "epoch": 17.62844243792325, + "grad_norm": 277.1895751953125, + "learning_rate": 3.4682395644283124e-06, + "loss": 39.1524, + "step": 4883 + }, + { + "epoch": 17.632054176072234, + "grad_norm": 275.1535949707031, + "learning_rate": 3.4627949183303086e-06, + "loss": 37.9027, + "step": 4884 + }, + { + "epoch": 17.635665914221217, + "grad_norm": 335.01776123046875, + "learning_rate": 3.4573502722323053e-06, + "loss": 36.7233, + "step": 4885 + }, + { + "epoch": 17.639277652370204, + "grad_norm": 297.1637878417969, + "learning_rate": 3.4519056261343015e-06, + "loss": 37.782, + "step": 4886 + }, + { + "epoch": 17.642889390519187, + "grad_norm": 265.400390625, + "learning_rate": 3.4464609800362978e-06, + "loss": 37.6639, + "step": 4887 + }, + { + "epoch": 17.64650112866817, + "grad_norm": 345.3449401855469, + "learning_rate": 3.441016333938294e-06, + "loss": 36.7617, + "step": 4888 + }, + { + "epoch": 17.650112866817157, + "grad_norm": 256.0724182128906, + "learning_rate": 3.4355716878402902e-06, + "loss": 32.9906, + "step": 4889 + }, + { + "epoch": 17.65372460496614, + "grad_norm": 260.698486328125, + "learning_rate": 3.430127041742287e-06, + "loss": 32.0811, + "step": 4890 + }, + { + "epoch": 17.65372460496614, + "eval_loss": 0.603126585483551, + "eval_runtime": 3.1268, + "eval_samples_per_second": 57.247, + "eval_steps_per_second": 57.247, + "step": 4890 + }, + { + "epoch": 17.657336343115123, + "grad_norm": 274.9847717285156, + "learning_rate": 3.424682395644283e-06, + "loss": 31.2138, + "step": 4891 + }, + { + "epoch": 17.66094808126411, + "grad_norm": 345.5099182128906, + "learning_rate": 3.4192377495462798e-06, + "loss": 30.302, + "step": 4892 + }, + { + "epoch": 17.664559819413093, + "grad_norm": 269.1453857421875, + "learning_rate": 3.413793103448276e-06, + "loss": 30.2679, + "step": 4893 + }, + { + "epoch": 17.668171557562076, + "grad_norm": 293.7955017089844, + "learning_rate": 3.4083484573502722e-06, + "loss": 31.7616, + "step": 4894 + }, + { + "epoch": 17.671783295711062, + "grad_norm": 306.1725769042969, + "learning_rate": 3.4029038112522685e-06, + "loss": 33.1265, + "step": 4895 + }, + { + "epoch": 17.675395033860045, + "grad_norm": 329.8185119628906, + "learning_rate": 3.397459165154265e-06, + "loss": 33.2131, + "step": 4896 + }, + { + "epoch": 17.67900677200903, + "grad_norm": 340.790283203125, + "learning_rate": 3.3920145190562613e-06, + "loss": 33.243, + "step": 4897 + }, + { + "epoch": 17.682618510158015, + "grad_norm": 324.004150390625, + "learning_rate": 3.386569872958258e-06, + "loss": 33.6235, + "step": 4898 + }, + { + "epoch": 17.686230248306998, + "grad_norm": 263.9126892089844, + "learning_rate": 3.3811252268602542e-06, + "loss": 33.2524, + "step": 4899 + }, + { + "epoch": 17.68984198645598, + "grad_norm": 274.6680603027344, + "learning_rate": 3.375680580762251e-06, + "loss": 34.6629, + "step": 4900 + }, + { + "epoch": 17.68984198645598, + "eval_loss": 0.6027778387069702, + "eval_runtime": 3.1418, + "eval_samples_per_second": 56.974, + "eval_steps_per_second": 56.974, + "step": 4900 + }, + { + "epoch": 17.693453724604964, + "grad_norm": 317.1280822753906, + "learning_rate": 3.370235934664247e-06, + "loss": 33.3088, + "step": 4901 + }, + { + "epoch": 17.69706546275395, + "grad_norm": 304.1892395019531, + "learning_rate": 3.364791288566243e-06, + "loss": 34.5045, + "step": 4902 + }, + { + "epoch": 17.700677200902934, + "grad_norm": 278.75933837890625, + "learning_rate": 3.3593466424682396e-06, + "loss": 35.8429, + "step": 4903 + }, + { + "epoch": 17.704288939051917, + "grad_norm": 299.76971435546875, + "learning_rate": 3.353901996370236e-06, + "loss": 36.2401, + "step": 4904 + }, + { + "epoch": 17.707900677200904, + "grad_norm": 253.46795654296875, + "learning_rate": 3.3484573502722324e-06, + "loss": 28.938, + "step": 4905 + }, + { + "epoch": 17.711512415349887, + "grad_norm": 220.74098205566406, + "learning_rate": 3.3430127041742287e-06, + "loss": 21.6689, + "step": 4906 + }, + { + "epoch": 17.71512415349887, + "grad_norm": 255.79150390625, + "learning_rate": 3.3375680580762253e-06, + "loss": 21.3497, + "step": 4907 + }, + { + "epoch": 17.718735891647857, + "grad_norm": 284.2683410644531, + "learning_rate": 3.3321234119782216e-06, + "loss": 22.9276, + "step": 4908 + }, + { + "epoch": 17.72234762979684, + "grad_norm": 296.7882080078125, + "learning_rate": 3.3266787658802182e-06, + "loss": 24.7304, + "step": 4909 + }, + { + "epoch": 17.725959367945823, + "grad_norm": 217.35546875, + "learning_rate": 3.321234119782214e-06, + "loss": 38.7687, + "step": 4910 + }, + { + "epoch": 17.725959367945823, + "eval_loss": 0.6015192866325378, + "eval_runtime": 3.1363, + "eval_samples_per_second": 57.074, + "eval_steps_per_second": 57.074, + "step": 4910 + }, + { + "epoch": 17.72957110609481, + "grad_norm": 256.7005920410156, + "learning_rate": 3.3157894736842107e-06, + "loss": 39.7421, + "step": 4911 + }, + { + "epoch": 17.733182844243792, + "grad_norm": 203.49417114257812, + "learning_rate": 3.310344827586207e-06, + "loss": 39.2911, + "step": 4912 + }, + { + "epoch": 17.736794582392776, + "grad_norm": 282.81439208984375, + "learning_rate": 3.3049001814882036e-06, + "loss": 39.2524, + "step": 4913 + }, + { + "epoch": 17.740406320541762, + "grad_norm": 315.3716735839844, + "learning_rate": 3.2994555353902e-06, + "loss": 37.2097, + "step": 4914 + }, + { + "epoch": 17.744018058690745, + "grad_norm": 250.96484375, + "learning_rate": 3.294010889292196e-06, + "loss": 37.6568, + "step": 4915 + }, + { + "epoch": 17.74762979683973, + "grad_norm": 299.4822082519531, + "learning_rate": 3.2885662431941927e-06, + "loss": 38.9578, + "step": 4916 + }, + { + "epoch": 17.751241534988715, + "grad_norm": 261.2537536621094, + "learning_rate": 3.2831215970961885e-06, + "loss": 40.3838, + "step": 4917 + }, + { + "epoch": 17.754853273137698, + "grad_norm": 220.55218505859375, + "learning_rate": 3.277676950998185e-06, + "loss": 39.2068, + "step": 4918 + }, + { + "epoch": 17.75846501128668, + "grad_norm": 238.06874084472656, + "learning_rate": 3.2722323049001814e-06, + "loss": 40.5383, + "step": 4919 + }, + { + "epoch": 17.762076749435664, + "grad_norm": 223.9597625732422, + "learning_rate": 3.266787658802178e-06, + "loss": 37.3857, + "step": 4920 + }, + { + "epoch": 17.762076749435664, + "eval_loss": 0.602606475353241, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 4920 + }, + { + "epoch": 17.76568848758465, + "grad_norm": 278.9289245605469, + "learning_rate": 3.2613430127041742e-06, + "loss": 37.187, + "step": 4921 + }, + { + "epoch": 17.769300225733634, + "grad_norm": 306.52398681640625, + "learning_rate": 3.255898366606171e-06, + "loss": 37.5243, + "step": 4922 + }, + { + "epoch": 17.772911963882617, + "grad_norm": 231.3939208984375, + "learning_rate": 3.250453720508167e-06, + "loss": 35.3104, + "step": 4923 + }, + { + "epoch": 17.776523702031604, + "grad_norm": 216.77613830566406, + "learning_rate": 3.2450090744101638e-06, + "loss": 36.0904, + "step": 4924 + }, + { + "epoch": 17.780135440180587, + "grad_norm": 256.0504150390625, + "learning_rate": 3.2395644283121596e-06, + "loss": 36.4117, + "step": 4925 + }, + { + "epoch": 17.78374717832957, + "grad_norm": 253.29734802246094, + "learning_rate": 3.2341197822141562e-06, + "loss": 37.197, + "step": 4926 + }, + { + "epoch": 17.787358916478556, + "grad_norm": 268.80780029296875, + "learning_rate": 3.2286751361161525e-06, + "loss": 36.4606, + "step": 4927 + }, + { + "epoch": 17.79097065462754, + "grad_norm": 302.3041076660156, + "learning_rate": 3.2232304900181487e-06, + "loss": 36.8647, + "step": 4928 + }, + { + "epoch": 17.794582392776523, + "grad_norm": 274.23797607421875, + "learning_rate": 3.2177858439201454e-06, + "loss": 37.3981, + "step": 4929 + }, + { + "epoch": 17.79819413092551, + "grad_norm": 281.4304504394531, + "learning_rate": 3.2123411978221416e-06, + "loss": 37.2304, + "step": 4930 + }, + { + "epoch": 17.79819413092551, + "eval_loss": 0.6050394773483276, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 4930 + }, + { + "epoch": 17.801805869074492, + "grad_norm": 277.47698974609375, + "learning_rate": 3.2068965517241382e-06, + "loss": 35.9281, + "step": 4931 + }, + { + "epoch": 17.805417607223475, + "grad_norm": 394.02294921875, + "learning_rate": 3.201451905626134e-06, + "loss": 39.0143, + "step": 4932 + }, + { + "epoch": 17.809029345372462, + "grad_norm": 252.8087158203125, + "learning_rate": 3.1960072595281307e-06, + "loss": 36.9452, + "step": 4933 + }, + { + "epoch": 17.812641083521445, + "grad_norm": 249.54962158203125, + "learning_rate": 3.190562613430127e-06, + "loss": 39.2442, + "step": 4934 + }, + { + "epoch": 17.816252821670428, + "grad_norm": 286.9231262207031, + "learning_rate": 3.1851179673321236e-06, + "loss": 38.6445, + "step": 4935 + }, + { + "epoch": 17.819864559819415, + "grad_norm": 345.7146911621094, + "learning_rate": 3.17967332123412e-06, + "loss": 37.1794, + "step": 4936 + }, + { + "epoch": 17.823476297968398, + "grad_norm": 271.23089599609375, + "learning_rate": 3.1742286751361165e-06, + "loss": 36.3952, + "step": 4937 + }, + { + "epoch": 17.82708803611738, + "grad_norm": 406.3717346191406, + "learning_rate": 3.1687840290381127e-06, + "loss": 33.8166, + "step": 4938 + }, + { + "epoch": 17.830699774266364, + "grad_norm": 300.12554931640625, + "learning_rate": 3.1633393829401094e-06, + "loss": 30.9614, + "step": 4939 + }, + { + "epoch": 17.83431151241535, + "grad_norm": 229.67218017578125, + "learning_rate": 3.157894736842105e-06, + "loss": 31.8592, + "step": 4940 + }, + { + "epoch": 17.83431151241535, + "eval_loss": 0.6021057367324829, + "eval_runtime": 3.1376, + "eval_samples_per_second": 57.049, + "eval_steps_per_second": 57.049, + "step": 4940 + }, + { + "epoch": 17.837923250564334, + "grad_norm": 269.0873107910156, + "learning_rate": 3.1524500907441014e-06, + "loss": 31.7702, + "step": 4941 + }, + { + "epoch": 17.841534988713317, + "grad_norm": 279.0237731933594, + "learning_rate": 3.147005444646098e-06, + "loss": 31.3615, + "step": 4942 + }, + { + "epoch": 17.845146726862303, + "grad_norm": 234.94839477539062, + "learning_rate": 3.1415607985480943e-06, + "loss": 31.9314, + "step": 4943 + }, + { + "epoch": 17.848758465011286, + "grad_norm": 239.25613403320312, + "learning_rate": 3.136116152450091e-06, + "loss": 32.4513, + "step": 4944 + }, + { + "epoch": 17.85237020316027, + "grad_norm": 257.09661865234375, + "learning_rate": 3.130671506352087e-06, + "loss": 34.4964, + "step": 4945 + }, + { + "epoch": 17.855981941309256, + "grad_norm": 328.88006591796875, + "learning_rate": 3.125226860254084e-06, + "loss": 33.1662, + "step": 4946 + }, + { + "epoch": 17.85959367945824, + "grad_norm": 291.4894714355469, + "learning_rate": 3.1197822141560796e-06, + "loss": 34.4406, + "step": 4947 + }, + { + "epoch": 17.863205417607222, + "grad_norm": 282.81158447265625, + "learning_rate": 3.1143375680580763e-06, + "loss": 32.7141, + "step": 4948 + }, + { + "epoch": 17.86681715575621, + "grad_norm": 300.0378112792969, + "learning_rate": 3.1088929219600725e-06, + "loss": 34.3423, + "step": 4949 + }, + { + "epoch": 17.870428893905192, + "grad_norm": 267.2983703613281, + "learning_rate": 3.103448275862069e-06, + "loss": 33.1653, + "step": 4950 + }, + { + "epoch": 17.870428893905192, + "eval_loss": 0.6020416021347046, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.081, + "eval_steps_per_second": 57.081, + "step": 4950 + }, + { + "epoch": 17.874040632054175, + "grad_norm": 270.53277587890625, + "learning_rate": 3.0980036297640654e-06, + "loss": 34.7582, + "step": 4951 + }, + { + "epoch": 17.877652370203162, + "grad_norm": 346.0074157714844, + "learning_rate": 3.092558983666062e-06, + "loss": 35.9911, + "step": 4952 + }, + { + "epoch": 17.881264108352145, + "grad_norm": 367.5807189941406, + "learning_rate": 3.0871143375680583e-06, + "loss": 35.3345, + "step": 4953 + }, + { + "epoch": 17.884875846501128, + "grad_norm": 304.21649169921875, + "learning_rate": 3.0816696914700545e-06, + "loss": 32.9797, + "step": 4954 + }, + { + "epoch": 17.888487584650115, + "grad_norm": 253.14601135253906, + "learning_rate": 3.0762250453720507e-06, + "loss": 22.6226, + "step": 4955 + }, + { + "epoch": 17.892099322799098, + "grad_norm": 270.3512268066406, + "learning_rate": 3.070780399274047e-06, + "loss": 21.9531, + "step": 4956 + }, + { + "epoch": 17.89571106094808, + "grad_norm": 192.73712158203125, + "learning_rate": 3.0653357531760436e-06, + "loss": 21.8497, + "step": 4957 + }, + { + "epoch": 17.899322799097064, + "grad_norm": 254.43759155273438, + "learning_rate": 3.05989110707804e-06, + "loss": 23.2694, + "step": 4958 + }, + { + "epoch": 17.90293453724605, + "grad_norm": 271.2293395996094, + "learning_rate": 3.0544464609800365e-06, + "loss": 22.9774, + "step": 4959 + }, + { + "epoch": 17.906546275395034, + "grad_norm": 213.7334747314453, + "learning_rate": 3.0490018148820327e-06, + "loss": 38.8821, + "step": 4960 + }, + { + "epoch": 17.906546275395034, + "eval_loss": 0.600848913192749, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.036, + "eval_steps_per_second": 57.036, + "step": 4960 + }, + { + "epoch": 17.910158013544017, + "grad_norm": 269.9356384277344, + "learning_rate": 3.0435571687840294e-06, + "loss": 38.6362, + "step": 4961 + }, + { + "epoch": 17.913769751693003, + "grad_norm": 237.6484832763672, + "learning_rate": 3.0381125226860256e-06, + "loss": 39.6388, + "step": 4962 + }, + { + "epoch": 17.917381489841986, + "grad_norm": 304.2347106933594, + "learning_rate": 3.032667876588022e-06, + "loss": 39.4308, + "step": 4963 + }, + { + "epoch": 17.92099322799097, + "grad_norm": 250.6772918701172, + "learning_rate": 3.027223230490018e-06, + "loss": 40.1923, + "step": 4964 + }, + { + "epoch": 17.924604966139956, + "grad_norm": 261.7320556640625, + "learning_rate": 3.0217785843920147e-06, + "loss": 37.862, + "step": 4965 + }, + { + "epoch": 17.92821670428894, + "grad_norm": 385.33197021484375, + "learning_rate": 3.016333938294011e-06, + "loss": 35.9139, + "step": 4966 + }, + { + "epoch": 17.931828442437922, + "grad_norm": 436.6773986816406, + "learning_rate": 3.010889292196007e-06, + "loss": 36.6259, + "step": 4967 + }, + { + "epoch": 17.93544018058691, + "grad_norm": 318.65673828125, + "learning_rate": 3.005444646098004e-06, + "loss": 36.1235, + "step": 4968 + }, + { + "epoch": 17.939051918735892, + "grad_norm": 241.6234893798828, + "learning_rate": 3e-06, + "loss": 37.4148, + "step": 4969 + }, + { + "epoch": 17.942663656884875, + "grad_norm": 316.8415832519531, + "learning_rate": 2.9945553539019963e-06, + "loss": 36.7089, + "step": 4970 + }, + { + "epoch": 17.942663656884875, + "eval_loss": 0.6032605171203613, + "eval_runtime": 3.137, + "eval_samples_per_second": 57.061, + "eval_steps_per_second": 57.061, + "step": 4970 + }, + { + "epoch": 17.94627539503386, + "grad_norm": 322.0501403808594, + "learning_rate": 2.9891107078039925e-06, + "loss": 37.2222, + "step": 4971 + }, + { + "epoch": 17.949887133182845, + "grad_norm": 300.4189453125, + "learning_rate": 2.983666061705989e-06, + "loss": 37.9156, + "step": 4972 + }, + { + "epoch": 17.953498871331828, + "grad_norm": 304.39263916015625, + "learning_rate": 2.9782214156079854e-06, + "loss": 38.5253, + "step": 4973 + }, + { + "epoch": 17.957110609480814, + "grad_norm": 297.4574890136719, + "learning_rate": 2.972776769509982e-06, + "loss": 38.4385, + "step": 4974 + }, + { + "epoch": 17.960722347629797, + "grad_norm": 367.7257080078125, + "learning_rate": 2.9673321234119783e-06, + "loss": 36.2943, + "step": 4975 + }, + { + "epoch": 17.96433408577878, + "grad_norm": 274.61724853515625, + "learning_rate": 2.961887477313975e-06, + "loss": 30.8753, + "step": 4976 + }, + { + "epoch": 17.967945823927764, + "grad_norm": 358.50201416015625, + "learning_rate": 2.956442831215971e-06, + "loss": 32.1308, + "step": 4977 + }, + { + "epoch": 17.97155756207675, + "grad_norm": 493.7792663574219, + "learning_rate": 2.9509981851179674e-06, + "loss": 33.2474, + "step": 4978 + }, + { + "epoch": 17.975169300225733, + "grad_norm": 426.67138671875, + "learning_rate": 2.9455535390199636e-06, + "loss": 33.7065, + "step": 4979 + }, + { + "epoch": 17.978781038374716, + "grad_norm": 524.0231323242188, + "learning_rate": 2.94010889292196e-06, + "loss": 34.6007, + "step": 4980 + }, + { + "epoch": 17.978781038374716, + "eval_loss": 0.6021283268928528, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.121, + "eval_steps_per_second": 57.121, + "step": 4980 + }, + { + "epoch": 17.982392776523703, + "grad_norm": 395.26715087890625, + "learning_rate": 2.9346642468239565e-06, + "loss": 33.9185, + "step": 4981 + }, + { + "epoch": 17.986004514672686, + "grad_norm": 400.0454406738281, + "learning_rate": 2.9292196007259528e-06, + "loss": 34.6485, + "step": 4982 + }, + { + "epoch": 17.98961625282167, + "grad_norm": 376.1269226074219, + "learning_rate": 2.9237749546279494e-06, + "loss": 34.668, + "step": 4983 + }, + { + "epoch": 17.993227990970656, + "grad_norm": 315.5225524902344, + "learning_rate": 2.9183303085299456e-06, + "loss": 30.7058, + "step": 4984 + }, + { + "epoch": 17.99683972911964, + "grad_norm": 221.5032958984375, + "learning_rate": 2.912885662431942e-06, + "loss": 21.8055, + "step": 4985 + }, + { + "epoch": 18.0, + "grad_norm": 226.06068420410156, + "learning_rate": 2.907441016333938e-06, + "loss": 20.5066, + "step": 4986 + }, + { + "epoch": 18.003611738148983, + "grad_norm": 209.69607543945312, + "learning_rate": 2.9019963702359348e-06, + "loss": 37.9156, + "step": 4987 + }, + { + "epoch": 18.00722347629797, + "grad_norm": 218.86709594726562, + "learning_rate": 2.896551724137931e-06, + "loss": 38.8204, + "step": 4988 + }, + { + "epoch": 18.010835214446953, + "grad_norm": 218.38180541992188, + "learning_rate": 2.8911070780399276e-06, + "loss": 38.5472, + "step": 4989 + }, + { + "epoch": 18.014446952595936, + "grad_norm": 338.4778747558594, + "learning_rate": 2.885662431941924e-06, + "loss": 37.7233, + "step": 4990 + }, + { + "epoch": 18.014446952595936, + "eval_loss": 0.6013379096984863, + "eval_runtime": 3.1415, + "eval_samples_per_second": 56.979, + "eval_steps_per_second": 56.979, + "step": 4990 + }, + { + "epoch": 18.018058690744923, + "grad_norm": 309.5385437011719, + "learning_rate": 2.8802177858439205e-06, + "loss": 38.3321, + "step": 4991 + }, + { + "epoch": 18.021670428893906, + "grad_norm": 335.67169189453125, + "learning_rate": 2.8747731397459168e-06, + "loss": 38.2367, + "step": 4992 + }, + { + "epoch": 18.02528216704289, + "grad_norm": 260.5025939941406, + "learning_rate": 2.8693284936479126e-06, + "loss": 38.5516, + "step": 4993 + }, + { + "epoch": 18.028893905191875, + "grad_norm": 265.4793395996094, + "learning_rate": 2.8638838475499092e-06, + "loss": 38.9539, + "step": 4994 + }, + { + "epoch": 18.03250564334086, + "grad_norm": 237.87942504882812, + "learning_rate": 2.8584392014519054e-06, + "loss": 39.4582, + "step": 4995 + }, + { + "epoch": 18.03611738148984, + "grad_norm": 252.11746215820312, + "learning_rate": 2.852994555353902e-06, + "loss": 39.3466, + "step": 4996 + }, + { + "epoch": 18.039729119638825, + "grad_norm": 298.1370849609375, + "learning_rate": 2.8475499092558983e-06, + "loss": 36.9779, + "step": 4997 + }, + { + "epoch": 18.04334085778781, + "grad_norm": 341.9007873535156, + "learning_rate": 2.842105263157895e-06, + "loss": 36.5117, + "step": 4998 + }, + { + "epoch": 18.046952595936794, + "grad_norm": 210.0319366455078, + "learning_rate": 2.8366606170598912e-06, + "loss": 34.7543, + "step": 4999 + }, + { + "epoch": 18.050564334085777, + "grad_norm": 385.6400146484375, + "learning_rate": 2.831215970961888e-06, + "loss": 36.4577, + "step": 5000 + }, + { + "epoch": 18.050564334085777, + "eval_loss": 0.6031082272529602, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.011, + "eval_steps_per_second": 57.011, + "step": 5000 + }, + { + "epoch": 18.054176072234764, + "grad_norm": 268.4949035644531, + "learning_rate": 2.8257713248638837e-06, + "loss": 36.3765, + "step": 5001 + }, + { + "epoch": 18.057787810383747, + "grad_norm": 311.2984313964844, + "learning_rate": 2.8203266787658803e-06, + "loss": 35.709, + "step": 5002 + }, + { + "epoch": 18.06139954853273, + "grad_norm": 264.0671081542969, + "learning_rate": 2.8148820326678766e-06, + "loss": 35.7978, + "step": 5003 + }, + { + "epoch": 18.065011286681717, + "grad_norm": 341.0770263671875, + "learning_rate": 2.8094373865698732e-06, + "loss": 36.8963, + "step": 5004 + }, + { + "epoch": 18.0686230248307, + "grad_norm": 253.3942108154297, + "learning_rate": 2.8039927404718694e-06, + "loss": 37.1135, + "step": 5005 + }, + { + "epoch": 18.072234762979683, + "grad_norm": 286.23736572265625, + "learning_rate": 2.7985480943738657e-06, + "loss": 35.736, + "step": 5006 + }, + { + "epoch": 18.07584650112867, + "grad_norm": 327.71295166015625, + "learning_rate": 2.7931034482758623e-06, + "loss": 36.4917, + "step": 5007 + }, + { + "epoch": 18.079458239277653, + "grad_norm": 351.00616455078125, + "learning_rate": 2.787658802177858e-06, + "loss": 37.2807, + "step": 5008 + }, + { + "epoch": 18.083069977426636, + "grad_norm": 291.02923583984375, + "learning_rate": 2.782214156079855e-06, + "loss": 38.0345, + "step": 5009 + }, + { + "epoch": 18.086681715575622, + "grad_norm": 288.7776184082031, + "learning_rate": 2.776769509981851e-06, + "loss": 37.112, + "step": 5010 + }, + { + "epoch": 18.086681715575622, + "eval_loss": 0.6058472990989685, + "eval_runtime": 3.1359, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 5010 + }, + { + "epoch": 18.090293453724605, + "grad_norm": 437.8114929199219, + "learning_rate": 2.7713248638838477e-06, + "loss": 37.9063, + "step": 5011 + }, + { + "epoch": 18.09390519187359, + "grad_norm": 324.5924072265625, + "learning_rate": 2.765880217785844e-06, + "loss": 37.8524, + "step": 5012 + }, + { + "epoch": 18.097516930022575, + "grad_norm": 358.40625, + "learning_rate": 2.7604355716878406e-06, + "loss": 37.5547, + "step": 5013 + }, + { + "epoch": 18.101128668171558, + "grad_norm": 290.75604248046875, + "learning_rate": 2.7549909255898368e-06, + "loss": 36.4437, + "step": 5014 + }, + { + "epoch": 18.10474040632054, + "grad_norm": 284.41424560546875, + "learning_rate": 2.7495462794918334e-06, + "loss": 34.3336, + "step": 5015 + }, + { + "epoch": 18.108352144469524, + "grad_norm": 254.59889221191406, + "learning_rate": 2.7441016333938292e-06, + "loss": 32.4527, + "step": 5016 + }, + { + "epoch": 18.11196388261851, + "grad_norm": 266.0207214355469, + "learning_rate": 2.738656987295826e-06, + "loss": 30.4014, + "step": 5017 + }, + { + "epoch": 18.115575620767494, + "grad_norm": 219.9434356689453, + "learning_rate": 2.733212341197822e-06, + "loss": 30.2838, + "step": 5018 + }, + { + "epoch": 18.119187358916477, + "grad_norm": 312.7678527832031, + "learning_rate": 2.7277676950998188e-06, + "loss": 31.6877, + "step": 5019 + }, + { + "epoch": 18.122799097065464, + "grad_norm": 282.99774169921875, + "learning_rate": 2.722323049001815e-06, + "loss": 33.3686, + "step": 5020 + }, + { + "epoch": 18.122799097065464, + "eval_loss": 0.6027761697769165, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 5020 + }, + { + "epoch": 18.126410835214447, + "grad_norm": 371.9994201660156, + "learning_rate": 2.7168784029038112e-06, + "loss": 32.5397, + "step": 5021 + }, + { + "epoch": 18.13002257336343, + "grad_norm": 241.19049072265625, + "learning_rate": 2.711433756805808e-06, + "loss": 33.4329, + "step": 5022 + }, + { + "epoch": 18.133634311512417, + "grad_norm": 310.2216796875, + "learning_rate": 2.7059891107078037e-06, + "loss": 31.888, + "step": 5023 + }, + { + "epoch": 18.1372460496614, + "grad_norm": 277.1349182128906, + "learning_rate": 2.7005444646098004e-06, + "loss": 33.9345, + "step": 5024 + }, + { + "epoch": 18.140857787810383, + "grad_norm": 419.3515930175781, + "learning_rate": 2.6950998185117966e-06, + "loss": 33.5826, + "step": 5025 + }, + { + "epoch": 18.14446952595937, + "grad_norm": 289.1166687011719, + "learning_rate": 2.6896551724137932e-06, + "loss": 34.324, + "step": 5026 + }, + { + "epoch": 18.148081264108352, + "grad_norm": 364.20233154296875, + "learning_rate": 2.6842105263157895e-06, + "loss": 34.45, + "step": 5027 + }, + { + "epoch": 18.151693002257336, + "grad_norm": 341.71551513671875, + "learning_rate": 2.678765880217786e-06, + "loss": 33.9126, + "step": 5028 + }, + { + "epoch": 18.155304740406322, + "grad_norm": 283.1939697265625, + "learning_rate": 2.6733212341197824e-06, + "loss": 33.7188, + "step": 5029 + }, + { + "epoch": 18.158916478555305, + "grad_norm": 369.6583251953125, + "learning_rate": 2.667876588021779e-06, + "loss": 35.0354, + "step": 5030 + }, + { + "epoch": 18.158916478555305, + "eval_loss": 0.6033984422683716, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 5030 + }, + { + "epoch": 18.16252821670429, + "grad_norm": 323.95806884765625, + "learning_rate": 2.662431941923775e-06, + "loss": 34.6853, + "step": 5031 + }, + { + "epoch": 18.16613995485327, + "grad_norm": 274.2629089355469, + "learning_rate": 2.6569872958257715e-06, + "loss": 32.1261, + "step": 5032 + }, + { + "epoch": 18.169751693002258, + "grad_norm": 229.66163635253906, + "learning_rate": 2.6515426497277677e-06, + "loss": 22.0549, + "step": 5033 + }, + { + "epoch": 18.17336343115124, + "grad_norm": 212.78070068359375, + "learning_rate": 2.646098003629764e-06, + "loss": 21.4483, + "step": 5034 + }, + { + "epoch": 18.176975169300224, + "grad_norm": 184.7995147705078, + "learning_rate": 2.6406533575317606e-06, + "loss": 22.5133, + "step": 5035 + }, + { + "epoch": 18.18058690744921, + "grad_norm": 256.6748046875, + "learning_rate": 2.635208711433757e-06, + "loss": 23.6443, + "step": 5036 + }, + { + "epoch": 18.184198645598194, + "grad_norm": 230.683349609375, + "learning_rate": 2.6297640653357535e-06, + "loss": 38.3633, + "step": 5037 + }, + { + "epoch": 18.187810383747177, + "grad_norm": 251.70166015625, + "learning_rate": 2.6243194192377497e-06, + "loss": 40.1229, + "step": 5038 + }, + { + "epoch": 18.191422121896164, + "grad_norm": 219.9066162109375, + "learning_rate": 2.618874773139746e-06, + "loss": 38.6539, + "step": 5039 + }, + { + "epoch": 18.195033860045147, + "grad_norm": 290.7185974121094, + "learning_rate": 2.613430127041742e-06, + "loss": 38.0385, + "step": 5040 + }, + { + "epoch": 18.195033860045147, + "eval_loss": 0.6022469401359558, + "eval_runtime": 3.1408, + "eval_samples_per_second": 56.993, + "eval_steps_per_second": 56.993, + "step": 5040 + }, + { + "epoch": 18.19864559819413, + "grad_norm": 334.9693908691406, + "learning_rate": 2.607985480943739e-06, + "loss": 38.2381, + "step": 5041 + }, + { + "epoch": 18.202257336343116, + "grad_norm": 283.9659423828125, + "learning_rate": 2.602540834845735e-06, + "loss": 39.2603, + "step": 5042 + }, + { + "epoch": 18.2058690744921, + "grad_norm": 291.4002990722656, + "learning_rate": 2.5970961887477317e-06, + "loss": 39.633, + "step": 5043 + }, + { + "epoch": 18.209480812641083, + "grad_norm": 249.14329528808594, + "learning_rate": 2.591651542649728e-06, + "loss": 39.1938, + "step": 5044 + }, + { + "epoch": 18.21309255079007, + "grad_norm": 226.1659393310547, + "learning_rate": 2.5862068965517246e-06, + "loss": 39.8308, + "step": 5045 + }, + { + "epoch": 18.216704288939052, + "grad_norm": 270.2198181152344, + "learning_rate": 2.5807622504537204e-06, + "loss": 38.4712, + "step": 5046 + }, + { + "epoch": 18.220316027088035, + "grad_norm": 263.83819580078125, + "learning_rate": 2.5753176043557166e-06, + "loss": 37.3572, + "step": 5047 + }, + { + "epoch": 18.223927765237022, + "grad_norm": 316.8177795410156, + "learning_rate": 2.5698729582577133e-06, + "loss": 36.3821, + "step": 5048 + }, + { + "epoch": 18.227539503386005, + "grad_norm": 318.7213134765625, + "learning_rate": 2.5644283121597095e-06, + "loss": 34.8209, + "step": 5049 + }, + { + "epoch": 18.231151241534988, + "grad_norm": 267.6168518066406, + "learning_rate": 2.558983666061706e-06, + "loss": 35.6173, + "step": 5050 + }, + { + "epoch": 18.231151241534988, + "eval_loss": 0.6044466495513916, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.038, + "eval_steps_per_second": 57.038, + "step": 5050 + }, + { + "epoch": 18.23476297968397, + "grad_norm": 277.739501953125, + "learning_rate": 2.5535390199637024e-06, + "loss": 35.2828, + "step": 5051 + }, + { + "epoch": 18.238374717832958, + "grad_norm": 288.2068786621094, + "learning_rate": 2.548094373865699e-06, + "loss": 36.7972, + "step": 5052 + }, + { + "epoch": 18.24198645598194, + "grad_norm": 217.59716796875, + "learning_rate": 2.5426497277676953e-06, + "loss": 36.3637, + "step": 5053 + }, + { + "epoch": 18.245598194130924, + "grad_norm": 411.8970031738281, + "learning_rate": 2.5372050816696915e-06, + "loss": 37.3086, + "step": 5054 + }, + { + "epoch": 18.24920993227991, + "grad_norm": 351.9718933105469, + "learning_rate": 2.5317604355716877e-06, + "loss": 37.0896, + "step": 5055 + }, + { + "epoch": 18.252821670428894, + "grad_norm": 343.1683044433594, + "learning_rate": 2.5263157894736844e-06, + "loss": 37.2533, + "step": 5056 + }, + { + "epoch": 18.256433408577877, + "grad_norm": 413.0977783203125, + "learning_rate": 2.5208711433756806e-06, + "loss": 36.9987, + "step": 5057 + }, + { + "epoch": 18.260045146726863, + "grad_norm": 331.73223876953125, + "learning_rate": 2.5154264972776773e-06, + "loss": 36.8624, + "step": 5058 + }, + { + "epoch": 18.263656884875846, + "grad_norm": 434.96990966796875, + "learning_rate": 2.5099818511796735e-06, + "loss": 37.949, + "step": 5059 + }, + { + "epoch": 18.26726862302483, + "grad_norm": 324.4934997558594, + "learning_rate": 2.5045372050816697e-06, + "loss": 37.6272, + "step": 5060 + }, + { + "epoch": 18.26726862302483, + "eval_loss": 0.6042292714118958, + "eval_runtime": 3.1335, + "eval_samples_per_second": 57.125, + "eval_steps_per_second": 57.125, + "step": 5060 + }, + { + "epoch": 18.270880361173816, + "grad_norm": 312.1228942871094, + "learning_rate": 2.499092558983666e-06, + "loss": 38.6362, + "step": 5061 + }, + { + "epoch": 18.2744920993228, + "grad_norm": 427.6184997558594, + "learning_rate": 2.493647912885662e-06, + "loss": 39.2934, + "step": 5062 + }, + { + "epoch": 18.278103837471782, + "grad_norm": 344.6819763183594, + "learning_rate": 2.488203266787659e-06, + "loss": 38.0684, + "step": 5063 + }, + { + "epoch": 18.28171557562077, + "grad_norm": 317.42303466796875, + "learning_rate": 2.482758620689655e-06, + "loss": 38.2323, + "step": 5064 + }, + { + "epoch": 18.285327313769752, + "grad_norm": 338.830810546875, + "learning_rate": 2.4773139745916517e-06, + "loss": 34.2699, + "step": 5065 + }, + { + "epoch": 18.288939051918735, + "grad_norm": 286.7263488769531, + "learning_rate": 2.471869328493648e-06, + "loss": 32.5149, + "step": 5066 + }, + { + "epoch": 18.292550790067722, + "grad_norm": 278.9923095703125, + "learning_rate": 2.4664246823956446e-06, + "loss": 31.033, + "step": 5067 + }, + { + "epoch": 18.296162528216705, + "grad_norm": 264.0198669433594, + "learning_rate": 2.460980036297641e-06, + "loss": 29.5549, + "step": 5068 + }, + { + "epoch": 18.299774266365688, + "grad_norm": 241.6163330078125, + "learning_rate": 2.455535390199637e-06, + "loss": 30.2173, + "step": 5069 + }, + { + "epoch": 18.30338600451467, + "grad_norm": 278.5418395996094, + "learning_rate": 2.4500907441016333e-06, + "loss": 30.8286, + "step": 5070 + }, + { + "epoch": 18.30338600451467, + "eval_loss": 0.6035094261169434, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 5070 + }, + { + "epoch": 18.306997742663658, + "grad_norm": 277.5758056640625, + "learning_rate": 2.44464609800363e-06, + "loss": 33.6778, + "step": 5071 + }, + { + "epoch": 18.31060948081264, + "grad_norm": 295.81201171875, + "learning_rate": 2.439201451905626e-06, + "loss": 33.5914, + "step": 5072 + }, + { + "epoch": 18.314221218961624, + "grad_norm": 293.4093017578125, + "learning_rate": 2.4337568058076224e-06, + "loss": 33.6203, + "step": 5073 + }, + { + "epoch": 18.31783295711061, + "grad_norm": 277.2228698730469, + "learning_rate": 2.428312159709619e-06, + "loss": 33.6465, + "step": 5074 + }, + { + "epoch": 18.321444695259594, + "grad_norm": 286.3224792480469, + "learning_rate": 2.4228675136116153e-06, + "loss": 32.6013, + "step": 5075 + }, + { + "epoch": 18.325056433408577, + "grad_norm": 320.6168212890625, + "learning_rate": 2.417422867513612e-06, + "loss": 32.6469, + "step": 5076 + }, + { + "epoch": 18.328668171557563, + "grad_norm": 327.364990234375, + "learning_rate": 2.4119782214156078e-06, + "loss": 34.354, + "step": 5077 + }, + { + "epoch": 18.332279909706546, + "grad_norm": 342.06634521484375, + "learning_rate": 2.4065335753176044e-06, + "loss": 34.3143, + "step": 5078 + }, + { + "epoch": 18.33589164785553, + "grad_norm": 370.70343017578125, + "learning_rate": 2.4010889292196006e-06, + "loss": 33.7771, + "step": 5079 + }, + { + "epoch": 18.339503386004516, + "grad_norm": 358.7357177734375, + "learning_rate": 2.3956442831215973e-06, + "loss": 35.5377, + "step": 5080 + }, + { + "epoch": 18.339503386004516, + "eval_loss": 0.6033809185028076, + "eval_runtime": 3.1442, + "eval_samples_per_second": 56.931, + "eval_steps_per_second": 56.931, + "step": 5080 + }, + { + "epoch": 18.3431151241535, + "grad_norm": 463.8668518066406, + "learning_rate": 2.3901996370235935e-06, + "loss": 35.4711, + "step": 5081 + }, + { + "epoch": 18.346726862302482, + "grad_norm": 256.5113220214844, + "learning_rate": 2.38475499092559e-06, + "loss": 26.8532, + "step": 5082 + }, + { + "epoch": 18.35033860045147, + "grad_norm": 228.83883666992188, + "learning_rate": 2.3793103448275864e-06, + "loss": 21.6636, + "step": 5083 + }, + { + "epoch": 18.353950338600452, + "grad_norm": 238.70742797851562, + "learning_rate": 2.3738656987295826e-06, + "loss": 22.2091, + "step": 5084 + }, + { + "epoch": 18.357562076749435, + "grad_norm": 276.8741760253906, + "learning_rate": 2.368421052631579e-06, + "loss": 22.1242, + "step": 5085 + }, + { + "epoch": 18.36117381489842, + "grad_norm": 226.4810333251953, + "learning_rate": 2.362976406533575e-06, + "loss": 23.359, + "step": 5086 + }, + { + "epoch": 18.364785553047405, + "grad_norm": 212.53111267089844, + "learning_rate": 2.3575317604355718e-06, + "loss": 37.7694, + "step": 5087 + }, + { + "epoch": 18.368397291196388, + "grad_norm": 227.26710510253906, + "learning_rate": 2.352087114337568e-06, + "loss": 39.8064, + "step": 5088 + }, + { + "epoch": 18.37200902934537, + "grad_norm": 201.0309295654297, + "learning_rate": 2.3466424682395646e-06, + "loss": 38.9716, + "step": 5089 + }, + { + "epoch": 18.375620767494357, + "grad_norm": 311.7691345214844, + "learning_rate": 2.341197822141561e-06, + "loss": 39.8326, + "step": 5090 + }, + { + "epoch": 18.375620767494357, + "eval_loss": 0.6036086082458496, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 5090 + }, + { + "epoch": 18.37923250564334, + "grad_norm": 251.5362091064453, + "learning_rate": 2.3357531760435575e-06, + "loss": 38.2591, + "step": 5091 + }, + { + "epoch": 18.382844243792324, + "grad_norm": 241.64373779296875, + "learning_rate": 2.3303085299455533e-06, + "loss": 38.0327, + "step": 5092 + }, + { + "epoch": 18.38645598194131, + "grad_norm": 231.7598114013672, + "learning_rate": 2.32486388384755e-06, + "loss": 38.6853, + "step": 5093 + }, + { + "epoch": 18.390067720090293, + "grad_norm": 287.66644287109375, + "learning_rate": 2.3194192377495462e-06, + "loss": 39.6929, + "step": 5094 + }, + { + "epoch": 18.393679458239276, + "grad_norm": 289.3146057128906, + "learning_rate": 2.313974591651543e-06, + "loss": 38.3129, + "step": 5095 + }, + { + "epoch": 18.397291196388263, + "grad_norm": 291.4801330566406, + "learning_rate": 2.308529945553539e-06, + "loss": 38.2505, + "step": 5096 + }, + { + "epoch": 18.400902934537246, + "grad_norm": 337.4052429199219, + "learning_rate": 2.3030852994555358e-06, + "loss": 37.7476, + "step": 5097 + }, + { + "epoch": 18.40451467268623, + "grad_norm": 460.0773010253906, + "learning_rate": 2.297640653357532e-06, + "loss": 36.1112, + "step": 5098 + }, + { + "epoch": 18.408126410835216, + "grad_norm": 322.4940185546875, + "learning_rate": 2.292196007259528e-06, + "loss": 36.5374, + "step": 5099 + }, + { + "epoch": 18.4117381489842, + "grad_norm": 350.4710388183594, + "learning_rate": 2.2867513611615244e-06, + "loss": 37.5286, + "step": 5100 + }, + { + "epoch": 18.4117381489842, + "eval_loss": 0.6045494079589844, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.101, + "eval_steps_per_second": 57.101, + "step": 5100 + }, + { + "epoch": 18.415349887133182, + "grad_norm": 306.18634033203125, + "learning_rate": 2.2813067150635207e-06, + "loss": 37.3676, + "step": 5101 + }, + { + "epoch": 18.41896162528217, + "grad_norm": 289.237060546875, + "learning_rate": 2.2758620689655173e-06, + "loss": 36.6916, + "step": 5102 + }, + { + "epoch": 18.42257336343115, + "grad_norm": 266.69207763671875, + "learning_rate": 2.2704174228675136e-06, + "loss": 36.2887, + "step": 5103 + }, + { + "epoch": 18.426185101580135, + "grad_norm": 264.54119873046875, + "learning_rate": 2.2649727767695102e-06, + "loss": 37.1267, + "step": 5104 + }, + { + "epoch": 18.42979683972912, + "grad_norm": 262.6132507324219, + "learning_rate": 2.2595281306715064e-06, + "loss": 36.6862, + "step": 5105 + }, + { + "epoch": 18.433408577878104, + "grad_norm": 231.68226623535156, + "learning_rate": 2.254083484573503e-06, + "loss": 35.7714, + "step": 5106 + }, + { + "epoch": 18.437020316027088, + "grad_norm": 299.72613525390625, + "learning_rate": 2.248638838475499e-06, + "loss": 37.648, + "step": 5107 + }, + { + "epoch": 18.44063205417607, + "grad_norm": 424.94708251953125, + "learning_rate": 2.2431941923774956e-06, + "loss": 35.9776, + "step": 5108 + }, + { + "epoch": 18.444243792325057, + "grad_norm": 449.78570556640625, + "learning_rate": 2.2377495462794918e-06, + "loss": 38.0571, + "step": 5109 + }, + { + "epoch": 18.44785553047404, + "grad_norm": 284.00634765625, + "learning_rate": 2.2323049001814884e-06, + "loss": 37.758, + "step": 5110 + }, + { + "epoch": 18.44785553047404, + "eval_loss": 0.6064541935920715, + "eval_runtime": 3.1377, + "eval_samples_per_second": 57.048, + "eval_steps_per_second": 57.048, + "step": 5110 + }, + { + "epoch": 18.451467268623023, + "grad_norm": 359.1011962890625, + "learning_rate": 2.2268602540834847e-06, + "loss": 38.8924, + "step": 5111 + }, + { + "epoch": 18.45507900677201, + "grad_norm": 307.7583923339844, + "learning_rate": 2.221415607985481e-06, + "loss": 38.2116, + "step": 5112 + }, + { + "epoch": 18.458690744920993, + "grad_norm": 359.5586242675781, + "learning_rate": 2.2159709618874776e-06, + "loss": 39.6894, + "step": 5113 + }, + { + "epoch": 18.462302483069976, + "grad_norm": 258.3985595703125, + "learning_rate": 2.2105263157894734e-06, + "loss": 36.4586, + "step": 5114 + }, + { + "epoch": 18.465914221218963, + "grad_norm": 363.09600830078125, + "learning_rate": 2.20508166969147e-06, + "loss": 34.489, + "step": 5115 + }, + { + "epoch": 18.469525959367946, + "grad_norm": 237.136474609375, + "learning_rate": 2.1996370235934662e-06, + "loss": 32.5826, + "step": 5116 + }, + { + "epoch": 18.47313769751693, + "grad_norm": 400.25604248046875, + "learning_rate": 2.194192377495463e-06, + "loss": 31.3005, + "step": 5117 + }, + { + "epoch": 18.476749435665916, + "grad_norm": 467.9855651855469, + "learning_rate": 2.188747731397459e-06, + "loss": 30.2261, + "step": 5118 + }, + { + "epoch": 18.4803611738149, + "grad_norm": 384.4250183105469, + "learning_rate": 2.1833030852994558e-06, + "loss": 33.5844, + "step": 5119 + }, + { + "epoch": 18.483972911963882, + "grad_norm": 324.4369201660156, + "learning_rate": 2.177858439201452e-06, + "loss": 32.5136, + "step": 5120 + }, + { + "epoch": 18.483972911963882, + "eval_loss": 0.602573573589325, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 5120 + }, + { + "epoch": 18.48758465011287, + "grad_norm": 372.0033264160156, + "learning_rate": 2.1724137931034487e-06, + "loss": 31.4322, + "step": 5121 + }, + { + "epoch": 18.49119638826185, + "grad_norm": 336.265869140625, + "learning_rate": 2.1669691470054445e-06, + "loss": 34.163, + "step": 5122 + }, + { + "epoch": 18.494808126410835, + "grad_norm": 339.8494873046875, + "learning_rate": 2.161524500907441e-06, + "loss": 31.2627, + "step": 5123 + }, + { + "epoch": 18.498419864559818, + "grad_norm": 279.3925476074219, + "learning_rate": 2.1560798548094374e-06, + "loss": 32.3994, + "step": 5124 + }, + { + "epoch": 18.502031602708804, + "grad_norm": 281.546875, + "learning_rate": 2.1506352087114336e-06, + "loss": 34.8467, + "step": 5125 + }, + { + "epoch": 18.505643340857787, + "grad_norm": 315.8692626953125, + "learning_rate": 2.1451905626134302e-06, + "loss": 33.632, + "step": 5126 + }, + { + "epoch": 18.50925507900677, + "grad_norm": 289.3066711425781, + "learning_rate": 2.1397459165154265e-06, + "loss": 34.312, + "step": 5127 + }, + { + "epoch": 18.512866817155757, + "grad_norm": 274.190673828125, + "learning_rate": 2.134301270417423e-06, + "loss": 32.9937, + "step": 5128 + }, + { + "epoch": 18.51647855530474, + "grad_norm": 317.9950256347656, + "learning_rate": 2.1288566243194194e-06, + "loss": 35.8788, + "step": 5129 + }, + { + "epoch": 18.520090293453723, + "grad_norm": 342.9775695800781, + "learning_rate": 2.1234119782214156e-06, + "loss": 35.2397, + "step": 5130 + }, + { + "epoch": 18.520090293453723, + "eval_loss": 0.6024553179740906, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.987, + "eval_steps_per_second": 56.987, + "step": 5130 + }, + { + "epoch": 18.52370203160271, + "grad_norm": 351.09637451171875, + "learning_rate": 2.117967332123412e-06, + "loss": 33.1556, + "step": 5131 + }, + { + "epoch": 18.527313769751693, + "grad_norm": 229.55613708496094, + "learning_rate": 2.1125226860254085e-06, + "loss": 26.6317, + "step": 5132 + }, + { + "epoch": 18.530925507900676, + "grad_norm": 234.53562927246094, + "learning_rate": 2.1070780399274047e-06, + "loss": 21.316, + "step": 5133 + }, + { + "epoch": 18.534537246049663, + "grad_norm": 241.59982299804688, + "learning_rate": 2.1016333938294014e-06, + "loss": 21.2739, + "step": 5134 + }, + { + "epoch": 18.538148984198646, + "grad_norm": 207.2808380126953, + "learning_rate": 2.0961887477313976e-06, + "loss": 22.736, + "step": 5135 + }, + { + "epoch": 18.54176072234763, + "grad_norm": 236.13955688476562, + "learning_rate": 2.0907441016333942e-06, + "loss": 22.7503, + "step": 5136 + }, + { + "epoch": 18.545372460496615, + "grad_norm": 181.6793670654297, + "learning_rate": 2.08529945553539e-06, + "loss": 37.9001, + "step": 5137 + }, + { + "epoch": 18.5489841986456, + "grad_norm": 249.5441131591797, + "learning_rate": 2.0798548094373863e-06, + "loss": 39.52, + "step": 5138 + }, + { + "epoch": 18.55259593679458, + "grad_norm": 215.67855834960938, + "learning_rate": 2.074410163339383e-06, + "loss": 38.6667, + "step": 5139 + }, + { + "epoch": 18.55620767494357, + "grad_norm": 280.9402770996094, + "learning_rate": 2.068965517241379e-06, + "loss": 36.9602, + "step": 5140 + }, + { + "epoch": 18.55620767494357, + "eval_loss": 0.6027256846427917, + "eval_runtime": 3.1361, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 5140 + }, + { + "epoch": 18.55981941309255, + "grad_norm": 265.9155578613281, + "learning_rate": 2.063520871143376e-06, + "loss": 38.8654, + "step": 5141 + }, + { + "epoch": 18.563431151241534, + "grad_norm": 300.0267028808594, + "learning_rate": 2.058076225045372e-06, + "loss": 38.8917, + "step": 5142 + }, + { + "epoch": 18.567042889390518, + "grad_norm": 243.0481414794922, + "learning_rate": 2.0526315789473687e-06, + "loss": 39.2785, + "step": 5143 + }, + { + "epoch": 18.570654627539504, + "grad_norm": 270.58380126953125, + "learning_rate": 2.047186932849365e-06, + "loss": 39.3892, + "step": 5144 + }, + { + "epoch": 18.574266365688487, + "grad_norm": 311.60430908203125, + "learning_rate": 2.041742286751361e-06, + "loss": 39.5933, + "step": 5145 + }, + { + "epoch": 18.57787810383747, + "grad_norm": 285.160400390625, + "learning_rate": 2.0362976406533574e-06, + "loss": 38.2962, + "step": 5146 + }, + { + "epoch": 18.581489841986457, + "grad_norm": 232.0592041015625, + "learning_rate": 2.030852994555354e-06, + "loss": 38.5965, + "step": 5147 + }, + { + "epoch": 18.58510158013544, + "grad_norm": 221.85525512695312, + "learning_rate": 2.0254083484573503e-06, + "loss": 36.516, + "step": 5148 + }, + { + "epoch": 18.588713318284423, + "grad_norm": 291.9794921875, + "learning_rate": 2.019963702359347e-06, + "loss": 36.3976, + "step": 5149 + }, + { + "epoch": 18.59232505643341, + "grad_norm": 387.8580322265625, + "learning_rate": 2.014519056261343e-06, + "loss": 35.2321, + "step": 5150 + }, + { + "epoch": 18.59232505643341, + "eval_loss": 0.6030355095863342, + "eval_runtime": 3.1378, + "eval_samples_per_second": 57.046, + "eval_steps_per_second": 57.046, + "step": 5150 + }, + { + "epoch": 18.595936794582393, + "grad_norm": 300.14508056640625, + "learning_rate": 2.0090744101633394e-06, + "loss": 36.4186, + "step": 5151 + }, + { + "epoch": 18.599548532731376, + "grad_norm": 294.1235656738281, + "learning_rate": 2.0036297640653356e-06, + "loss": 36.014, + "step": 5152 + }, + { + "epoch": 18.603160270880363, + "grad_norm": 389.1570129394531, + "learning_rate": 1.998185117967332e-06, + "loss": 36.1648, + "step": 5153 + }, + { + "epoch": 18.606772009029346, + "grad_norm": 244.6651153564453, + "learning_rate": 1.9927404718693285e-06, + "loss": 36.1033, + "step": 5154 + }, + { + "epoch": 18.61038374717833, + "grad_norm": 302.52996826171875, + "learning_rate": 1.9872958257713247e-06, + "loss": 37.1531, + "step": 5155 + }, + { + "epoch": 18.613995485327315, + "grad_norm": 352.86273193359375, + "learning_rate": 1.9818511796733214e-06, + "loss": 37.8204, + "step": 5156 + }, + { + "epoch": 18.6176072234763, + "grad_norm": 308.61431884765625, + "learning_rate": 1.9764065335753176e-06, + "loss": 37.2097, + "step": 5157 + }, + { + "epoch": 18.62121896162528, + "grad_norm": 288.30712890625, + "learning_rate": 1.9709618874773143e-06, + "loss": 36.4242, + "step": 5158 + }, + { + "epoch": 18.624830699774268, + "grad_norm": 315.9750671386719, + "learning_rate": 1.9655172413793105e-06, + "loss": 35.9204, + "step": 5159 + }, + { + "epoch": 18.62844243792325, + "grad_norm": 468.51055908203125, + "learning_rate": 1.9600725952813067e-06, + "loss": 38.9178, + "step": 5160 + }, + { + "epoch": 18.62844243792325, + "eval_loss": 0.6054540872573853, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 5160 + }, + { + "epoch": 18.632054176072234, + "grad_norm": 310.5861511230469, + "learning_rate": 1.954627949183303e-06, + "loss": 37.9588, + "step": 5161 + }, + { + "epoch": 18.635665914221217, + "grad_norm": 424.3090515136719, + "learning_rate": 1.9491833030852996e-06, + "loss": 38.1028, + "step": 5162 + }, + { + "epoch": 18.639277652370204, + "grad_norm": 330.6189880371094, + "learning_rate": 1.943738656987296e-06, + "loss": 36.5096, + "step": 5163 + }, + { + "epoch": 18.642889390519187, + "grad_norm": 305.9330139160156, + "learning_rate": 1.9382940108892925e-06, + "loss": 36.871, + "step": 5164 + }, + { + "epoch": 18.64650112866817, + "grad_norm": 410.06793212890625, + "learning_rate": 1.9328493647912887e-06, + "loss": 37.4061, + "step": 5165 + }, + { + "epoch": 18.650112866817157, + "grad_norm": 385.49127197265625, + "learning_rate": 1.927404718693285e-06, + "loss": 33.6399, + "step": 5166 + }, + { + "epoch": 18.65372460496614, + "grad_norm": 270.96783447265625, + "learning_rate": 1.9219600725952816e-06, + "loss": 31.3483, + "step": 5167 + }, + { + "epoch": 18.657336343115123, + "grad_norm": 329.84405517578125, + "learning_rate": 1.9165154264972774e-06, + "loss": 30.2639, + "step": 5168 + }, + { + "epoch": 18.66094808126411, + "grad_norm": 413.7260437011719, + "learning_rate": 1.911070780399274e-06, + "loss": 31.2749, + "step": 5169 + }, + { + "epoch": 18.664559819413093, + "grad_norm": 276.43585205078125, + "learning_rate": 1.9056261343012705e-06, + "loss": 30.3596, + "step": 5170 + }, + { + "epoch": 18.664559819413093, + "eval_loss": 0.6022100448608398, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.117, + "eval_steps_per_second": 57.117, + "step": 5170 + }, + { + "epoch": 18.668171557562076, + "grad_norm": 248.9257049560547, + "learning_rate": 1.9001814882032667e-06, + "loss": 32.4066, + "step": 5171 + }, + { + "epoch": 18.671783295711062, + "grad_norm": 252.70388793945312, + "learning_rate": 1.8947368421052632e-06, + "loss": 32.3724, + "step": 5172 + }, + { + "epoch": 18.675395033860045, + "grad_norm": 325.0677795410156, + "learning_rate": 1.8892921960072596e-06, + "loss": 32.3041, + "step": 5173 + }, + { + "epoch": 18.67900677200903, + "grad_norm": 420.9740295410156, + "learning_rate": 1.883847549909256e-06, + "loss": 32.6609, + "step": 5174 + }, + { + "epoch": 18.682618510158015, + "grad_norm": 239.59371948242188, + "learning_rate": 1.878402903811252e-06, + "loss": 32.8471, + "step": 5175 + }, + { + "epoch": 18.686230248306998, + "grad_norm": 301.13165283203125, + "learning_rate": 1.8729582577132487e-06, + "loss": 32.2686, + "step": 5176 + }, + { + "epoch": 18.68984198645598, + "grad_norm": 282.7923889160156, + "learning_rate": 1.867513611615245e-06, + "loss": 34.2726, + "step": 5177 + }, + { + "epoch": 18.693453724604964, + "grad_norm": 434.20550537109375, + "learning_rate": 1.8620689655172414e-06, + "loss": 35.335, + "step": 5178 + }, + { + "epoch": 18.69706546275395, + "grad_norm": 306.680908203125, + "learning_rate": 1.8566243194192379e-06, + "loss": 33.3156, + "step": 5179 + }, + { + "epoch": 18.700677200902934, + "grad_norm": 253.27711486816406, + "learning_rate": 1.8511796733212343e-06, + "loss": 34.9504, + "step": 5180 + }, + { + "epoch": 18.700677200902934, + "eval_loss": 0.6021104454994202, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.994, + "eval_steps_per_second": 56.994, + "step": 5180 + }, + { + "epoch": 18.704288939051917, + "grad_norm": 391.74945068359375, + "learning_rate": 1.8457350272232305e-06, + "loss": 35.285, + "step": 5181 + }, + { + "epoch": 18.707900677200904, + "grad_norm": 265.4142150878906, + "learning_rate": 1.840290381125227e-06, + "loss": 27.689, + "step": 5182 + }, + { + "epoch": 18.711512415349887, + "grad_norm": 217.80746459960938, + "learning_rate": 1.8348457350272234e-06, + "loss": 22.6159, + "step": 5183 + }, + { + "epoch": 18.71512415349887, + "grad_norm": 220.21180725097656, + "learning_rate": 1.8294010889292196e-06, + "loss": 22.1321, + "step": 5184 + }, + { + "epoch": 18.718735891647857, + "grad_norm": 239.4197998046875, + "learning_rate": 1.8239564428312159e-06, + "loss": 22.5479, + "step": 5185 + }, + { + "epoch": 18.72234762979684, + "grad_norm": 281.7828674316406, + "learning_rate": 1.8185117967332123e-06, + "loss": 23.5363, + "step": 5186 + }, + { + "epoch": 18.725959367945823, + "grad_norm": 231.81980895996094, + "learning_rate": 1.8130671506352088e-06, + "loss": 39.0953, + "step": 5187 + }, + { + "epoch": 18.72957110609481, + "grad_norm": 242.0535430908203, + "learning_rate": 1.807622504537205e-06, + "loss": 39.4842, + "step": 5188 + }, + { + "epoch": 18.733182844243792, + "grad_norm": 235.6869659423828, + "learning_rate": 1.8021778584392014e-06, + "loss": 37.4884, + "step": 5189 + }, + { + "epoch": 18.736794582392776, + "grad_norm": 291.5176086425781, + "learning_rate": 1.7967332123411979e-06, + "loss": 38.9612, + "step": 5190 + }, + { + "epoch": 18.736794582392776, + "eval_loss": 0.6040608286857605, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.034, + "eval_steps_per_second": 57.034, + "step": 5190 + }, + { + "epoch": 18.740406320541762, + "grad_norm": 407.5574645996094, + "learning_rate": 1.7912885662431943e-06, + "loss": 39.3531, + "step": 5191 + }, + { + "epoch": 18.744018058690745, + "grad_norm": 277.07891845703125, + "learning_rate": 1.7858439201451905e-06, + "loss": 38.4866, + "step": 5192 + }, + { + "epoch": 18.74762979683973, + "grad_norm": 350.2939453125, + "learning_rate": 1.780399274047187e-06, + "loss": 38.0073, + "step": 5193 + }, + { + "epoch": 18.751241534988715, + "grad_norm": 395.7618103027344, + "learning_rate": 1.7749546279491834e-06, + "loss": 38.1693, + "step": 5194 + }, + { + "epoch": 18.754853273137698, + "grad_norm": 296.43267822265625, + "learning_rate": 1.7695099818511799e-06, + "loss": 38.6162, + "step": 5195 + }, + { + "epoch": 18.75846501128668, + "grad_norm": 335.7173156738281, + "learning_rate": 1.764065335753176e-06, + "loss": 38.9182, + "step": 5196 + }, + { + "epoch": 18.762076749435664, + "grad_norm": 273.09368896484375, + "learning_rate": 1.7586206896551725e-06, + "loss": 38.0685, + "step": 5197 + }, + { + "epoch": 18.76568848758465, + "grad_norm": 359.718505859375, + "learning_rate": 1.7531760435571688e-06, + "loss": 36.8994, + "step": 5198 + }, + { + "epoch": 18.769300225733634, + "grad_norm": 345.5837097167969, + "learning_rate": 1.7477313974591652e-06, + "loss": 35.375, + "step": 5199 + }, + { + "epoch": 18.772911963882617, + "grad_norm": 266.8583984375, + "learning_rate": 1.7422867513611614e-06, + "loss": 34.7559, + "step": 5200 + }, + { + "epoch": 18.772911963882617, + "eval_loss": 0.6007165908813477, + "eval_runtime": 3.1395, + "eval_samples_per_second": 57.016, + "eval_steps_per_second": 57.016, + "step": 5200 + }, + { + "epoch": 18.776523702031604, + "grad_norm": 317.10662841796875, + "learning_rate": 1.7368421052631579e-06, + "loss": 35.6206, + "step": 5201 + }, + { + "epoch": 18.780135440180587, + "grad_norm": 418.6651916503906, + "learning_rate": 1.7313974591651543e-06, + "loss": 36.7981, + "step": 5202 + }, + { + "epoch": 18.78374717832957, + "grad_norm": 247.767333984375, + "learning_rate": 1.7259528130671508e-06, + "loss": 36.226, + "step": 5203 + }, + { + "epoch": 18.787358916478556, + "grad_norm": 406.6683349609375, + "learning_rate": 1.720508166969147e-06, + "loss": 36.5781, + "step": 5204 + }, + { + "epoch": 18.79097065462754, + "grad_norm": 433.02984619140625, + "learning_rate": 1.7150635208711434e-06, + "loss": 37.8221, + "step": 5205 + }, + { + "epoch": 18.794582392776523, + "grad_norm": 291.1831970214844, + "learning_rate": 1.7096188747731399e-06, + "loss": 37.9125, + "step": 5206 + }, + { + "epoch": 18.79819413092551, + "grad_norm": 276.8603820800781, + "learning_rate": 1.7041742286751361e-06, + "loss": 38.0886, + "step": 5207 + }, + { + "epoch": 18.801805869074492, + "grad_norm": 442.06317138671875, + "learning_rate": 1.6987295825771326e-06, + "loss": 36.8432, + "step": 5208 + }, + { + "epoch": 18.805417607223475, + "grad_norm": 323.7881774902344, + "learning_rate": 1.693284936479129e-06, + "loss": 37.2775, + "step": 5209 + }, + { + "epoch": 18.809029345372462, + "grad_norm": 320.2378234863281, + "learning_rate": 1.6878402903811254e-06, + "loss": 37.4478, + "step": 5210 + }, + { + "epoch": 18.809029345372462, + "eval_loss": 0.6044604182243347, + "eval_runtime": 3.1433, + "eval_samples_per_second": 56.946, + "eval_steps_per_second": 56.946, + "step": 5210 + }, + { + "epoch": 18.812641083521445, + "grad_norm": 474.6519470214844, + "learning_rate": 1.6823956442831215e-06, + "loss": 37.9463, + "step": 5211 + }, + { + "epoch": 18.816252821670428, + "grad_norm": 265.7474060058594, + "learning_rate": 1.676950998185118e-06, + "loss": 37.7662, + "step": 5212 + }, + { + "epoch": 18.819864559819415, + "grad_norm": 312.014892578125, + "learning_rate": 1.6715063520871143e-06, + "loss": 37.3329, + "step": 5213 + }, + { + "epoch": 18.823476297968398, + "grad_norm": 407.24884033203125, + "learning_rate": 1.6660617059891108e-06, + "loss": 36.4324, + "step": 5214 + }, + { + "epoch": 18.82708803611738, + "grad_norm": 368.05255126953125, + "learning_rate": 1.660617059891107e-06, + "loss": 33.9691, + "step": 5215 + }, + { + "epoch": 18.830699774266364, + "grad_norm": 410.3034362792969, + "learning_rate": 1.6551724137931035e-06, + "loss": 32.7008, + "step": 5216 + }, + { + "epoch": 18.83431151241535, + "grad_norm": 318.6436462402344, + "learning_rate": 1.6497277676951e-06, + "loss": 32.1152, + "step": 5217 + }, + { + "epoch": 18.837923250564334, + "grad_norm": 366.3927307128906, + "learning_rate": 1.6442831215970963e-06, + "loss": 31.3827, + "step": 5218 + }, + { + "epoch": 18.841534988713317, + "grad_norm": 319.7497863769531, + "learning_rate": 1.6388384754990926e-06, + "loss": 30.781, + "step": 5219 + }, + { + "epoch": 18.845146726862303, + "grad_norm": 405.86669921875, + "learning_rate": 1.633393829401089e-06, + "loss": 30.5807, + "step": 5220 + }, + { + "epoch": 18.845146726862303, + "eval_loss": 0.6014994382858276, + "eval_runtime": 3.1339, + "eval_samples_per_second": 57.118, + "eval_steps_per_second": 57.118, + "step": 5220 + }, + { + "epoch": 18.848758465011286, + "grad_norm": 518.0769653320312, + "learning_rate": 1.6279491833030855e-06, + "loss": 33.4028, + "step": 5221 + }, + { + "epoch": 18.85237020316027, + "grad_norm": 390.18609619140625, + "learning_rate": 1.6225045372050819e-06, + "loss": 31.805, + "step": 5222 + }, + { + "epoch": 18.855981941309256, + "grad_norm": 323.1091003417969, + "learning_rate": 1.6170598911070781e-06, + "loss": 33.4414, + "step": 5223 + }, + { + "epoch": 18.85959367945824, + "grad_norm": 311.3610534667969, + "learning_rate": 1.6116152450090744e-06, + "loss": 34.1178, + "step": 5224 + }, + { + "epoch": 18.863205417607222, + "grad_norm": 271.058349609375, + "learning_rate": 1.6061705989110708e-06, + "loss": 34.4702, + "step": 5225 + }, + { + "epoch": 18.86681715575621, + "grad_norm": 301.3417663574219, + "learning_rate": 1.600725952813067e-06, + "loss": 32.5166, + "step": 5226 + }, + { + "epoch": 18.870428893905192, + "grad_norm": 259.4634094238281, + "learning_rate": 1.5952813067150635e-06, + "loss": 32.1952, + "step": 5227 + }, + { + "epoch": 18.874040632054175, + "grad_norm": 299.018310546875, + "learning_rate": 1.58983666061706e-06, + "loss": 33.6772, + "step": 5228 + }, + { + "epoch": 18.877652370203162, + "grad_norm": 286.192626953125, + "learning_rate": 1.5843920145190564e-06, + "loss": 35.4991, + "step": 5229 + }, + { + "epoch": 18.881264108352145, + "grad_norm": 380.0414733886719, + "learning_rate": 1.5789473684210526e-06, + "loss": 34.4324, + "step": 5230 + }, + { + "epoch": 18.881264108352145, + "eval_loss": 0.6009039282798767, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.078, + "eval_steps_per_second": 57.078, + "step": 5230 + }, + { + "epoch": 18.884875846501128, + "grad_norm": 333.0609436035156, + "learning_rate": 1.573502722323049e-06, + "loss": 35.8757, + "step": 5231 + }, + { + "epoch": 18.888487584650115, + "grad_norm": 343.6198425292969, + "learning_rate": 1.5680580762250455e-06, + "loss": 30.4765, + "step": 5232 + }, + { + "epoch": 18.892099322799098, + "grad_norm": 222.56637573242188, + "learning_rate": 1.562613430127042e-06, + "loss": 21.2017, + "step": 5233 + }, + { + "epoch": 18.89571106094808, + "grad_norm": 209.6859130859375, + "learning_rate": 1.5571687840290381e-06, + "loss": 21.5447, + "step": 5234 + }, + { + "epoch": 18.899322799097064, + "grad_norm": 249.7464141845703, + "learning_rate": 1.5517241379310346e-06, + "loss": 23.6495, + "step": 5235 + }, + { + "epoch": 18.90293453724605, + "grad_norm": 267.1141357421875, + "learning_rate": 1.546279491833031e-06, + "loss": 23.0331, + "step": 5236 + }, + { + "epoch": 18.906546275395034, + "grad_norm": 204.96266174316406, + "learning_rate": 1.5408348457350273e-06, + "loss": 37.8988, + "step": 5237 + }, + { + "epoch": 18.910158013544017, + "grad_norm": 247.50706481933594, + "learning_rate": 1.5353901996370235e-06, + "loss": 38.5207, + "step": 5238 + }, + { + "epoch": 18.913769751693003, + "grad_norm": 350.968994140625, + "learning_rate": 1.52994555353902e-06, + "loss": 37.981, + "step": 5239 + }, + { + "epoch": 18.917381489841986, + "grad_norm": 308.0031433105469, + "learning_rate": 1.5245009074410164e-06, + "loss": 39.2602, + "step": 5240 + }, + { + "epoch": 18.917381489841986, + "eval_loss": 0.6020543575286865, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 5240 + }, + { + "epoch": 18.92099322799097, + "grad_norm": 353.0065002441406, + "learning_rate": 1.5190562613430128e-06, + "loss": 39.7331, + "step": 5241 + }, + { + "epoch": 18.924604966139956, + "grad_norm": 495.2381591796875, + "learning_rate": 1.513611615245009e-06, + "loss": 37.6413, + "step": 5242 + }, + { + "epoch": 18.92821670428894, + "grad_norm": 470.453125, + "learning_rate": 1.5081669691470055e-06, + "loss": 36.1928, + "step": 5243 + }, + { + "epoch": 18.931828442437922, + "grad_norm": 632.1090698242188, + "learning_rate": 1.502722323049002e-06, + "loss": 37.4057, + "step": 5244 + }, + { + "epoch": 18.93544018058691, + "grad_norm": 488.4659118652344, + "learning_rate": 1.4972776769509982e-06, + "loss": 37.1323, + "step": 5245 + }, + { + "epoch": 18.939051918735892, + "grad_norm": 426.4764709472656, + "learning_rate": 1.4918330308529946e-06, + "loss": 36.1739, + "step": 5246 + }, + { + "epoch": 18.942663656884875, + "grad_norm": 413.3072509765625, + "learning_rate": 1.486388384754991e-06, + "loss": 36.243, + "step": 5247 + }, + { + "epoch": 18.94627539503386, + "grad_norm": 364.8636169433594, + "learning_rate": 1.4809437386569875e-06, + "loss": 36.8362, + "step": 5248 + }, + { + "epoch": 18.949887133182845, + "grad_norm": 306.2213134765625, + "learning_rate": 1.4754990925589837e-06, + "loss": 38.4677, + "step": 5249 + }, + { + "epoch": 18.953498871331828, + "grad_norm": 300.37664794921875, + "learning_rate": 1.47005444646098e-06, + "loss": 38.1286, + "step": 5250 + }, + { + "epoch": 18.953498871331828, + "eval_loss": 0.6017122864723206, + "eval_runtime": 3.1398, + "eval_samples_per_second": 57.009, + "eval_steps_per_second": 57.009, + "step": 5250 + }, + { + "epoch": 18.957110609480814, + "grad_norm": 242.2681884765625, + "learning_rate": 1.4646098003629764e-06, + "loss": 36.9862, + "step": 5251 + }, + { + "epoch": 18.960722347629797, + "grad_norm": 276.28179931640625, + "learning_rate": 1.4591651542649728e-06, + "loss": 35.0475, + "step": 5252 + }, + { + "epoch": 18.96433408577878, + "grad_norm": 256.64508056640625, + "learning_rate": 1.453720508166969e-06, + "loss": 30.4778, + "step": 5253 + }, + { + "epoch": 18.967945823927764, + "grad_norm": 275.1043701171875, + "learning_rate": 1.4482758620689655e-06, + "loss": 32.3847, + "step": 5254 + }, + { + "epoch": 18.97155756207675, + "grad_norm": 324.22955322265625, + "learning_rate": 1.442831215970962e-06, + "loss": 32.9917, + "step": 5255 + }, + { + "epoch": 18.975169300225733, + "grad_norm": 328.7778625488281, + "learning_rate": 1.4373865698729584e-06, + "loss": 31.5901, + "step": 5256 + }, + { + "epoch": 18.978781038374716, + "grad_norm": 307.2234191894531, + "learning_rate": 1.4319419237749546e-06, + "loss": 33.5733, + "step": 5257 + }, + { + "epoch": 18.982392776523703, + "grad_norm": 471.10552978515625, + "learning_rate": 1.426497277676951e-06, + "loss": 33.3204, + "step": 5258 + }, + { + "epoch": 18.986004514672686, + "grad_norm": 286.2314453125, + "learning_rate": 1.4210526315789475e-06, + "loss": 35.8205, + "step": 5259 + }, + { + "epoch": 18.98961625282167, + "grad_norm": 341.5156555175781, + "learning_rate": 1.415607985480944e-06, + "loss": 35.7746, + "step": 5260 + }, + { + "epoch": 18.98961625282167, + "eval_loss": 0.6023879051208496, + "eval_runtime": 3.1375, + "eval_samples_per_second": 57.051, + "eval_steps_per_second": 57.051, + "step": 5260 + }, + { + "epoch": 18.993227990970656, + "grad_norm": 257.73345947265625, + "learning_rate": 1.4101633393829402e-06, + "loss": 26.5263, + "step": 5261 + }, + { + "epoch": 18.99683972911964, + "grad_norm": 197.04811096191406, + "learning_rate": 1.4047186932849366e-06, + "loss": 21.9504, + "step": 5262 + }, + { + "epoch": 19.0, + "grad_norm": 237.48069763183594, + "learning_rate": 1.3992740471869328e-06, + "loss": 20.273, + "step": 5263 + }, + { + "epoch": 19.003611738148983, + "grad_norm": 238.98065185546875, + "learning_rate": 1.393829401088929e-06, + "loss": 37.7406, + "step": 5264 + }, + { + "epoch": 19.00722347629797, + "grad_norm": 209.30593872070312, + "learning_rate": 1.3883847549909255e-06, + "loss": 39.8367, + "step": 5265 + }, + { + "epoch": 19.010835214446953, + "grad_norm": 251.27899169921875, + "learning_rate": 1.382940108892922e-06, + "loss": 39.0155, + "step": 5266 + }, + { + "epoch": 19.014446952595936, + "grad_norm": 278.8317565917969, + "learning_rate": 1.3774954627949184e-06, + "loss": 37.9895, + "step": 5267 + }, + { + "epoch": 19.018058690744923, + "grad_norm": 227.08090209960938, + "learning_rate": 1.3720508166969146e-06, + "loss": 38.2986, + "step": 5268 + }, + { + "epoch": 19.021670428893906, + "grad_norm": 248.63221740722656, + "learning_rate": 1.366606170598911e-06, + "loss": 38.9906, + "step": 5269 + }, + { + "epoch": 19.02528216704289, + "grad_norm": 216.49449157714844, + "learning_rate": 1.3611615245009075e-06, + "loss": 39.4871, + "step": 5270 + }, + { + "epoch": 19.02528216704289, + "eval_loss": 0.6001354455947876, + "eval_runtime": 3.1409, + "eval_samples_per_second": 56.991, + "eval_steps_per_second": 56.991, + "step": 5270 + }, + { + "epoch": 19.028893905191875, + "grad_norm": 219.4734649658203, + "learning_rate": 1.355716878402904e-06, + "loss": 38.8617, + "step": 5271 + }, + { + "epoch": 19.03250564334086, + "grad_norm": 211.6996307373047, + "learning_rate": 1.3502722323049002e-06, + "loss": 39.6489, + "step": 5272 + }, + { + "epoch": 19.03611738148984, + "grad_norm": 306.1536865234375, + "learning_rate": 1.3448275862068966e-06, + "loss": 39.4235, + "step": 5273 + }, + { + "epoch": 19.039729119638825, + "grad_norm": 260.87353515625, + "learning_rate": 1.339382940108893e-06, + "loss": 37.9957, + "step": 5274 + }, + { + "epoch": 19.04334085778781, + "grad_norm": 266.5260314941406, + "learning_rate": 1.3339382940108895e-06, + "loss": 36.4288, + "step": 5275 + }, + { + "epoch": 19.046952595936794, + "grad_norm": 295.3840637207031, + "learning_rate": 1.3284936479128857e-06, + "loss": 35.1091, + "step": 5276 + }, + { + "epoch": 19.050564334085777, + "grad_norm": 381.60748291015625, + "learning_rate": 1.323049001814882e-06, + "loss": 37.6468, + "step": 5277 + }, + { + "epoch": 19.054176072234764, + "grad_norm": 430.3531494140625, + "learning_rate": 1.3176043557168784e-06, + "loss": 35.8345, + "step": 5278 + }, + { + "epoch": 19.057787810383747, + "grad_norm": 393.22772216796875, + "learning_rate": 1.3121597096188749e-06, + "loss": 37.1803, + "step": 5279 + }, + { + "epoch": 19.06139954853273, + "grad_norm": 308.1875915527344, + "learning_rate": 1.306715063520871e-06, + "loss": 36.5634, + "step": 5280 + }, + { + "epoch": 19.06139954853273, + "eval_loss": 0.6008215546607971, + "eval_runtime": 3.1371, + "eval_samples_per_second": 57.059, + "eval_steps_per_second": 57.059, + "step": 5280 + }, + { + "epoch": 19.065011286681717, + "grad_norm": 379.57183837890625, + "learning_rate": 1.3012704174228675e-06, + "loss": 36.7718, + "step": 5281 + }, + { + "epoch": 19.0686230248307, + "grad_norm": 482.2864685058594, + "learning_rate": 1.295825771324864e-06, + "loss": 37.0207, + "step": 5282 + }, + { + "epoch": 19.072234762979683, + "grad_norm": 310.96142578125, + "learning_rate": 1.2903811252268602e-06, + "loss": 37.0438, + "step": 5283 + }, + { + "epoch": 19.07584650112867, + "grad_norm": 274.2409973144531, + "learning_rate": 1.2849364791288566e-06, + "loss": 36.3401, + "step": 5284 + }, + { + "epoch": 19.079458239277653, + "grad_norm": 242.37583923339844, + "learning_rate": 1.279491833030853e-06, + "loss": 36.6312, + "step": 5285 + }, + { + "epoch": 19.083069977426636, + "grad_norm": 244.91583251953125, + "learning_rate": 1.2740471869328495e-06, + "loss": 37.4987, + "step": 5286 + }, + { + "epoch": 19.086681715575622, + "grad_norm": 234.21511840820312, + "learning_rate": 1.2686025408348458e-06, + "loss": 38.1373, + "step": 5287 + }, + { + "epoch": 19.090293453724605, + "grad_norm": 277.73931884765625, + "learning_rate": 1.2631578947368422e-06, + "loss": 38.8423, + "step": 5288 + }, + { + "epoch": 19.09390519187359, + "grad_norm": 247.04971313476562, + "learning_rate": 1.2577132486388386e-06, + "loss": 37.2783, + "step": 5289 + }, + { + "epoch": 19.097516930022575, + "grad_norm": 289.022216796875, + "learning_rate": 1.2522686025408349e-06, + "loss": 36.2534, + "step": 5290 + }, + { + "epoch": 19.097516930022575, + "eval_loss": 0.6020083427429199, + "eval_runtime": 3.1426, + "eval_samples_per_second": 56.959, + "eval_steps_per_second": 56.959, + "step": 5290 + }, + { + "epoch": 19.101128668171558, + "grad_norm": 294.7291564941406, + "learning_rate": 1.246823956442831e-06, + "loss": 36.4967, + "step": 5291 + }, + { + "epoch": 19.10474040632054, + "grad_norm": 238.0512237548828, + "learning_rate": 1.2413793103448275e-06, + "loss": 34.1439, + "step": 5292 + }, + { + "epoch": 19.108352144469524, + "grad_norm": 254.0712127685547, + "learning_rate": 1.235934664246824e-06, + "loss": 30.9632, + "step": 5293 + }, + { + "epoch": 19.11196388261851, + "grad_norm": 321.169921875, + "learning_rate": 1.2304900181488204e-06, + "loss": 29.2757, + "step": 5294 + }, + { + "epoch": 19.115575620767494, + "grad_norm": 308.8040466308594, + "learning_rate": 1.2250453720508167e-06, + "loss": 31.2651, + "step": 5295 + }, + { + "epoch": 19.119187358916477, + "grad_norm": 369.23004150390625, + "learning_rate": 1.219600725952813e-06, + "loss": 32.9721, + "step": 5296 + }, + { + "epoch": 19.122799097065464, + "grad_norm": 348.9309997558594, + "learning_rate": 1.2141560798548095e-06, + "loss": 31.8663, + "step": 5297 + }, + { + "epoch": 19.126410835214447, + "grad_norm": 330.5960388183594, + "learning_rate": 1.208711433756806e-06, + "loss": 31.6104, + "step": 5298 + }, + { + "epoch": 19.13002257336343, + "grad_norm": 380.59161376953125, + "learning_rate": 1.2032667876588022e-06, + "loss": 32.1911, + "step": 5299 + }, + { + "epoch": 19.133634311512417, + "grad_norm": 402.8847961425781, + "learning_rate": 1.1978221415607986e-06, + "loss": 33.4755, + "step": 5300 + }, + { + "epoch": 19.133634311512417, + "eval_loss": 0.6015223264694214, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.043, + "eval_steps_per_second": 57.043, + "step": 5300 + }, + { + "epoch": 19.1372460496614, + "grad_norm": 409.90667724609375, + "learning_rate": 1.192377495462795e-06, + "loss": 33.7318, + "step": 5301 + }, + { + "epoch": 19.140857787810383, + "grad_norm": 425.7220458984375, + "learning_rate": 1.1869328493647913e-06, + "loss": 33.6745, + "step": 5302 + }, + { + "epoch": 19.14446952595937, + "grad_norm": 373.9212951660156, + "learning_rate": 1.1814882032667876e-06, + "loss": 33.8191, + "step": 5303 + }, + { + "epoch": 19.148081264108352, + "grad_norm": 381.37469482421875, + "learning_rate": 1.176043557168784e-06, + "loss": 33.8767, + "step": 5304 + }, + { + "epoch": 19.151693002257336, + "grad_norm": 267.89288330078125, + "learning_rate": 1.1705989110707804e-06, + "loss": 33.3089, + "step": 5305 + }, + { + "epoch": 19.155304740406322, + "grad_norm": 326.5400390625, + "learning_rate": 1.1651542649727767e-06, + "loss": 35.798, + "step": 5306 + }, + { + "epoch": 19.158916478555305, + "grad_norm": 307.7875061035156, + "learning_rate": 1.1597096188747731e-06, + "loss": 34.2442, + "step": 5307 + }, + { + "epoch": 19.16252821670429, + "grad_norm": 401.6629333496094, + "learning_rate": 1.1542649727767695e-06, + "loss": 34.7408, + "step": 5308 + }, + { + "epoch": 19.16613995485327, + "grad_norm": 297.7433166503906, + "learning_rate": 1.148820326678766e-06, + "loss": 30.2776, + "step": 5309 + }, + { + "epoch": 19.169751693002258, + "grad_norm": 221.2977752685547, + "learning_rate": 1.1433756805807622e-06, + "loss": 21.3755, + "step": 5310 + }, + { + "epoch": 19.169751693002258, + "eval_loss": 0.6015586853027344, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.039, + "eval_steps_per_second": 57.039, + "step": 5310 + }, + { + "epoch": 19.17336343115124, + "grad_norm": 232.3973846435547, + "learning_rate": 1.1379310344827587e-06, + "loss": 20.9516, + "step": 5311 + }, + { + "epoch": 19.176975169300224, + "grad_norm": 220.6842803955078, + "learning_rate": 1.1324863883847551e-06, + "loss": 22.3779, + "step": 5312 + }, + { + "epoch": 19.18058690744921, + "grad_norm": 207.9031982421875, + "learning_rate": 1.1270417422867515e-06, + "loss": 23.4166, + "step": 5313 + }, + { + "epoch": 19.184198645598194, + "grad_norm": 211.70394897460938, + "learning_rate": 1.1215970961887478e-06, + "loss": 37.157, + "step": 5314 + }, + { + "epoch": 19.187810383747177, + "grad_norm": 243.7276611328125, + "learning_rate": 1.1161524500907442e-06, + "loss": 40.0688, + "step": 5315 + }, + { + "epoch": 19.191422121896164, + "grad_norm": 199.99435424804688, + "learning_rate": 1.1107078039927405e-06, + "loss": 38.9213, + "step": 5316 + }, + { + "epoch": 19.195033860045147, + "grad_norm": 214.8607177734375, + "learning_rate": 1.1052631578947367e-06, + "loss": 37.5778, + "step": 5317 + }, + { + "epoch": 19.19864559819413, + "grad_norm": 241.69651794433594, + "learning_rate": 1.0998185117967331e-06, + "loss": 36.9334, + "step": 5318 + }, + { + "epoch": 19.202257336343116, + "grad_norm": 344.64849853515625, + "learning_rate": 1.0943738656987296e-06, + "loss": 38.9315, + "step": 5319 + }, + { + "epoch": 19.2058690744921, + "grad_norm": 248.10731506347656, + "learning_rate": 1.088929219600726e-06, + "loss": 37.94, + "step": 5320 + }, + { + "epoch": 19.2058690744921, + "eval_loss": 0.6011462211608887, + "eval_runtime": 3.1406, + "eval_samples_per_second": 56.995, + "eval_steps_per_second": 56.995, + "step": 5320 + }, + { + "epoch": 19.209480812641083, + "grad_norm": 262.3296813964844, + "learning_rate": 1.0834845735027222e-06, + "loss": 38.2016, + "step": 5321 + }, + { + "epoch": 19.21309255079007, + "grad_norm": 276.65179443359375, + "learning_rate": 1.0780399274047187e-06, + "loss": 39.0355, + "step": 5322 + }, + { + "epoch": 19.216704288939052, + "grad_norm": 377.314697265625, + "learning_rate": 1.0725952813067151e-06, + "loss": 39.0543, + "step": 5323 + }, + { + "epoch": 19.220316027088035, + "grad_norm": 282.5917053222656, + "learning_rate": 1.0671506352087116e-06, + "loss": 37.1001, + "step": 5324 + }, + { + "epoch": 19.223927765237022, + "grad_norm": 420.4558410644531, + "learning_rate": 1.0617059891107078e-06, + "loss": 36.5363, + "step": 5325 + }, + { + "epoch": 19.227539503386005, + "grad_norm": 460.62701416015625, + "learning_rate": 1.0562613430127042e-06, + "loss": 35.8127, + "step": 5326 + }, + { + "epoch": 19.231151241534988, + "grad_norm": 492.31170654296875, + "learning_rate": 1.0508166969147007e-06, + "loss": 35.7043, + "step": 5327 + }, + { + "epoch": 19.23476297968397, + "grad_norm": 385.2608947753906, + "learning_rate": 1.0453720508166971e-06, + "loss": 35.0656, + "step": 5328 + }, + { + "epoch": 19.238374717832958, + "grad_norm": 322.3689270019531, + "learning_rate": 1.0399274047186931e-06, + "loss": 37.2145, + "step": 5329 + }, + { + "epoch": 19.24198645598194, + "grad_norm": 309.3829650878906, + "learning_rate": 1.0344827586206896e-06, + "loss": 35.4361, + "step": 5330 + }, + { + "epoch": 19.24198645598194, + "eval_loss": 0.6023690104484558, + "eval_runtime": 3.1424, + "eval_samples_per_second": 56.964, + "eval_steps_per_second": 56.964, + "step": 5330 + }, + { + "epoch": 19.245598194130924, + "grad_norm": 342.5604248046875, + "learning_rate": 1.029038112522686e-06, + "loss": 36.9204, + "step": 5331 + }, + { + "epoch": 19.24920993227991, + "grad_norm": 404.432373046875, + "learning_rate": 1.0235934664246825e-06, + "loss": 37.9907, + "step": 5332 + }, + { + "epoch": 19.252821670428894, + "grad_norm": 333.77044677734375, + "learning_rate": 1.0181488203266787e-06, + "loss": 36.1432, + "step": 5333 + }, + { + "epoch": 19.256433408577877, + "grad_norm": 297.11480712890625, + "learning_rate": 1.0127041742286751e-06, + "loss": 37.824, + "step": 5334 + }, + { + "epoch": 19.260045146726863, + "grad_norm": 271.3321838378906, + "learning_rate": 1.0072595281306716e-06, + "loss": 36.0811, + "step": 5335 + }, + { + "epoch": 19.263656884875846, + "grad_norm": 246.6988525390625, + "learning_rate": 1.0018148820326678e-06, + "loss": 36.6415, + "step": 5336 + }, + { + "epoch": 19.26726862302483, + "grad_norm": 264.7515563964844, + "learning_rate": 9.963702359346642e-07, + "loss": 37.048, + "step": 5337 + }, + { + "epoch": 19.270880361173816, + "grad_norm": 238.71475219726562, + "learning_rate": 9.909255898366607e-07, + "loss": 37.3109, + "step": 5338 + }, + { + "epoch": 19.2744920993228, + "grad_norm": 232.89256286621094, + "learning_rate": 9.854809437386571e-07, + "loss": 37.0776, + "step": 5339 + }, + { + "epoch": 19.278103837471782, + "grad_norm": 309.91796875, + "learning_rate": 9.800362976406534e-07, + "loss": 37.5227, + "step": 5340 + }, + { + "epoch": 19.278103837471782, + "eval_loss": 0.603413999080658, + "eval_runtime": 3.1407, + "eval_samples_per_second": 56.993, + "eval_steps_per_second": 56.993, + "step": 5340 + }, + { + "epoch": 19.28171557562077, + "grad_norm": 415.85009765625, + "learning_rate": 9.745916515426498e-07, + "loss": 38.7916, + "step": 5341 + }, + { + "epoch": 19.285327313769752, + "grad_norm": 336.5480651855469, + "learning_rate": 9.691470054446462e-07, + "loss": 34.7108, + "step": 5342 + }, + { + "epoch": 19.288939051918735, + "grad_norm": 361.7843017578125, + "learning_rate": 9.637023593466425e-07, + "loss": 33.3624, + "step": 5343 + }, + { + "epoch": 19.292550790067722, + "grad_norm": 278.5044250488281, + "learning_rate": 9.582577132486387e-07, + "loss": 31.9202, + "step": 5344 + }, + { + "epoch": 19.296162528216705, + "grad_norm": 378.85003662109375, + "learning_rate": 9.528130671506353e-07, + "loss": 32.0191, + "step": 5345 + }, + { + "epoch": 19.299774266365688, + "grad_norm": 307.8309020996094, + "learning_rate": 9.473684210526316e-07, + "loss": 30.1278, + "step": 5346 + }, + { + "epoch": 19.30338600451467, + "grad_norm": 377.0649108886719, + "learning_rate": 9.41923774954628e-07, + "loss": 30.8298, + "step": 5347 + }, + { + "epoch": 19.306997742663658, + "grad_norm": 366.9952392578125, + "learning_rate": 9.364791288566244e-07, + "loss": 32.8491, + "step": 5348 + }, + { + "epoch": 19.31060948081264, + "grad_norm": 384.6134948730469, + "learning_rate": 9.310344827586207e-07, + "loss": 33.3014, + "step": 5349 + }, + { + "epoch": 19.314221218961624, + "grad_norm": 377.0379943847656, + "learning_rate": 9.255898366606171e-07, + "loss": 31.1514, + "step": 5350 + }, + { + "epoch": 19.314221218961624, + "eval_loss": 0.6012714505195618, + "eval_runtime": 3.1381, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 5350 + }, + { + "epoch": 19.31783295711061, + "grad_norm": 419.49359130859375, + "learning_rate": 9.201451905626135e-07, + "loss": 32.2402, + "step": 5351 + }, + { + "epoch": 19.321444695259594, + "grad_norm": 290.20050048828125, + "learning_rate": 9.147005444646098e-07, + "loss": 33.9084, + "step": 5352 + }, + { + "epoch": 19.325056433408577, + "grad_norm": 283.597412109375, + "learning_rate": 9.092558983666062e-07, + "loss": 34.3691, + "step": 5353 + }, + { + "epoch": 19.328668171557563, + "grad_norm": 322.4947204589844, + "learning_rate": 9.038112522686025e-07, + "loss": 33.2218, + "step": 5354 + }, + { + "epoch": 19.332279909706546, + "grad_norm": 346.0417785644531, + "learning_rate": 8.983666061705989e-07, + "loss": 32.6409, + "step": 5355 + }, + { + "epoch": 19.33589164785553, + "grad_norm": 282.1748962402344, + "learning_rate": 8.929219600725953e-07, + "loss": 33.722, + "step": 5356 + }, + { + "epoch": 19.339503386004516, + "grad_norm": 302.015625, + "learning_rate": 8.874773139745917e-07, + "loss": 35.1681, + "step": 5357 + }, + { + "epoch": 19.3431151241535, + "grad_norm": 325.37005615234375, + "learning_rate": 8.82032667876588e-07, + "loss": 34.2712, + "step": 5358 + }, + { + "epoch": 19.346726862302482, + "grad_norm": 291.301513671875, + "learning_rate": 8.765880217785844e-07, + "loss": 31.3185, + "step": 5359 + }, + { + "epoch": 19.35033860045147, + "grad_norm": 190.09767150878906, + "learning_rate": 8.711433756805807e-07, + "loss": 22.3868, + "step": 5360 + }, + { + "epoch": 19.35033860045147, + "eval_loss": 0.6009277105331421, + "eval_runtime": 3.1385, + "eval_samples_per_second": 57.033, + "eval_steps_per_second": 57.033, + "step": 5360 + }, + { + "epoch": 19.353950338600452, + "grad_norm": 231.69676208496094, + "learning_rate": 8.656987295825772e-07, + "loss": 21.1889, + "step": 5361 + }, + { + "epoch": 19.357562076749435, + "grad_norm": 255.91258239746094, + "learning_rate": 8.602540834845735e-07, + "loss": 23.1246, + "step": 5362 + }, + { + "epoch": 19.36117381489842, + "grad_norm": 265.2499694824219, + "learning_rate": 8.548094373865699e-07, + "loss": 22.9017, + "step": 5363 + }, + { + "epoch": 19.364785553047405, + "grad_norm": 217.06552124023438, + "learning_rate": 8.493647912885663e-07, + "loss": 38.4372, + "step": 5364 + }, + { + "epoch": 19.368397291196388, + "grad_norm": 220.9014434814453, + "learning_rate": 8.439201451905627e-07, + "loss": 38.8259, + "step": 5365 + }, + { + "epoch": 19.37200902934537, + "grad_norm": 217.46336364746094, + "learning_rate": 8.38475499092559e-07, + "loss": 37.7587, + "step": 5366 + }, + { + "epoch": 19.375620767494357, + "grad_norm": 219.59889221191406, + "learning_rate": 8.330308529945554e-07, + "loss": 38.2973, + "step": 5367 + }, + { + "epoch": 19.37923250564334, + "grad_norm": 206.93772888183594, + "learning_rate": 8.275862068965517e-07, + "loss": 36.6878, + "step": 5368 + }, + { + "epoch": 19.382844243792324, + "grad_norm": 268.5470886230469, + "learning_rate": 8.221415607985482e-07, + "loss": 37.4095, + "step": 5369 + }, + { + "epoch": 19.38645598194131, + "grad_norm": 228.70216369628906, + "learning_rate": 8.166969147005445e-07, + "loss": 39.1159, + "step": 5370 + }, + { + "epoch": 19.38645598194131, + "eval_loss": 0.6011511087417603, + "eval_runtime": 3.1369, + "eval_samples_per_second": 57.063, + "eval_steps_per_second": 57.063, + "step": 5370 + }, + { + "epoch": 19.390067720090293, + "grad_norm": 212.8670654296875, + "learning_rate": 8.112522686025409e-07, + "loss": 38.8929, + "step": 5371 + }, + { + "epoch": 19.393679458239276, + "grad_norm": 228.0734405517578, + "learning_rate": 8.058076225045372e-07, + "loss": 39.7208, + "step": 5372 + }, + { + "epoch": 19.397291196388263, + "grad_norm": 239.56906127929688, + "learning_rate": 8.003629764065335e-07, + "loss": 38.3748, + "step": 5373 + }, + { + "epoch": 19.400902934537246, + "grad_norm": 243.6251220703125, + "learning_rate": 7.9491833030853e-07, + "loss": 37.3178, + "step": 5374 + }, + { + "epoch": 19.40451467268623, + "grad_norm": 407.86907958984375, + "learning_rate": 7.894736842105263e-07, + "loss": 36.5418, + "step": 5375 + }, + { + "epoch": 19.408126410835216, + "grad_norm": 260.6579284667969, + "learning_rate": 7.840290381125227e-07, + "loss": 36.9031, + "step": 5376 + }, + { + "epoch": 19.4117381489842, + "grad_norm": 358.63946533203125, + "learning_rate": 7.785843920145191e-07, + "loss": 35.4851, + "step": 5377 + }, + { + "epoch": 19.415349887133182, + "grad_norm": 414.06634521484375, + "learning_rate": 7.731397459165155e-07, + "loss": 34.6983, + "step": 5378 + }, + { + "epoch": 19.41896162528217, + "grad_norm": 471.287109375, + "learning_rate": 7.676950998185117e-07, + "loss": 36.7265, + "step": 5379 + }, + { + "epoch": 19.42257336343115, + "grad_norm": 366.92767333984375, + "learning_rate": 7.622504537205082e-07, + "loss": 35.4779, + "step": 5380 + }, + { + "epoch": 19.42257336343115, + "eval_loss": 0.6010181903839111, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 5380 + }, + { + "epoch": 19.426185101580135, + "grad_norm": 392.23138427734375, + "learning_rate": 7.568058076225045e-07, + "loss": 36.1143, + "step": 5381 + }, + { + "epoch": 19.42979683972912, + "grad_norm": 296.0258483886719, + "learning_rate": 7.51361161524501e-07, + "loss": 37.5785, + "step": 5382 + }, + { + "epoch": 19.433408577878104, + "grad_norm": 425.22247314453125, + "learning_rate": 7.459165154264973e-07, + "loss": 37.7905, + "step": 5383 + }, + { + "epoch": 19.437020316027088, + "grad_norm": 288.7919921875, + "learning_rate": 7.404718693284937e-07, + "loss": 36.3987, + "step": 5384 + }, + { + "epoch": 19.44063205417607, + "grad_norm": 269.2157287597656, + "learning_rate": 7.3502722323049e-07, + "loss": 36.9862, + "step": 5385 + }, + { + "epoch": 19.444243792325057, + "grad_norm": 236.28067016601562, + "learning_rate": 7.295825771324864e-07, + "loss": 36.3645, + "step": 5386 + }, + { + "epoch": 19.44785553047404, + "grad_norm": 217.44627380371094, + "learning_rate": 7.241379310344827e-07, + "loss": 37.0505, + "step": 5387 + }, + { + "epoch": 19.451467268623023, + "grad_norm": 260.61175537109375, + "learning_rate": 7.186932849364792e-07, + "loss": 37.1031, + "step": 5388 + }, + { + "epoch": 19.45507900677201, + "grad_norm": 282.62017822265625, + "learning_rate": 7.132486388384755e-07, + "loss": 38.2061, + "step": 5389 + }, + { + "epoch": 19.458690744920993, + "grad_norm": 231.78170776367188, + "learning_rate": 7.07803992740472e-07, + "loss": 35.8868, + "step": 5390 + }, + { + "epoch": 19.458690744920993, + "eval_loss": 0.6014392375946045, + "eval_runtime": 3.1328, + "eval_samples_per_second": 57.137, + "eval_steps_per_second": 57.137, + "step": 5390 + }, + { + "epoch": 19.462302483069976, + "grad_norm": 246.38380432128906, + "learning_rate": 7.023593466424683e-07, + "loss": 36.1871, + "step": 5391 + }, + { + "epoch": 19.465914221218963, + "grad_norm": 239.06924438476562, + "learning_rate": 6.969147005444645e-07, + "loss": 34.5704, + "step": 5392 + }, + { + "epoch": 19.469525959367946, + "grad_norm": 396.09027099609375, + "learning_rate": 6.91470054446461e-07, + "loss": 33.6148, + "step": 5393 + }, + { + "epoch": 19.47313769751693, + "grad_norm": 250.8205108642578, + "learning_rate": 6.860254083484573e-07, + "loss": 31.535, + "step": 5394 + }, + { + "epoch": 19.476749435665916, + "grad_norm": 257.0039978027344, + "learning_rate": 6.805807622504538e-07, + "loss": 31.6366, + "step": 5395 + }, + { + "epoch": 19.4803611738149, + "grad_norm": 283.7515563964844, + "learning_rate": 6.751361161524501e-07, + "loss": 30.4001, + "step": 5396 + }, + { + "epoch": 19.483972911963882, + "grad_norm": 335.6957702636719, + "learning_rate": 6.696914700544465e-07, + "loss": 31.1016, + "step": 5397 + }, + { + "epoch": 19.48758465011287, + "grad_norm": 338.0590515136719, + "learning_rate": 6.642468239564429e-07, + "loss": 31.7707, + "step": 5398 + }, + { + "epoch": 19.49119638826185, + "grad_norm": 409.0957946777344, + "learning_rate": 6.588021778584392e-07, + "loss": 34.904, + "step": 5399 + }, + { + "epoch": 19.494808126410835, + "grad_norm": 265.0601806640625, + "learning_rate": 6.533575317604355e-07, + "loss": 32.1701, + "step": 5400 + }, + { + "epoch": 19.494808126410835, + "eval_loss": 0.6015393137931824, + "eval_runtime": 3.1382, + "eval_samples_per_second": 57.04, + "eval_steps_per_second": 57.04, + "step": 5400 + }, + { + "epoch": 19.498419864559818, + "grad_norm": 354.3403625488281, + "learning_rate": 6.47912885662432e-07, + "loss": 32.7803, + "step": 5401 + }, + { + "epoch": 19.502031602708804, + "grad_norm": 257.71124267578125, + "learning_rate": 6.424682395644283e-07, + "loss": 33.4401, + "step": 5402 + }, + { + "epoch": 19.505643340857787, + "grad_norm": 325.73876953125, + "learning_rate": 6.370235934664248e-07, + "loss": 33.3075, + "step": 5403 + }, + { + "epoch": 19.50925507900677, + "grad_norm": 283.1676940917969, + "learning_rate": 6.315789473684211e-07, + "loss": 34.5868, + "step": 5404 + }, + { + "epoch": 19.512866817155757, + "grad_norm": 265.0743713378906, + "learning_rate": 6.261343012704174e-07, + "loss": 34.2399, + "step": 5405 + }, + { + "epoch": 19.51647855530474, + "grad_norm": 381.4061279296875, + "learning_rate": 6.206896551724138e-07, + "loss": 35.8848, + "step": 5406 + }, + { + "epoch": 19.520090293453723, + "grad_norm": 311.1829833984375, + "learning_rate": 6.152450090744102e-07, + "loss": 34.5162, + "step": 5407 + }, + { + "epoch": 19.52370203160271, + "grad_norm": 301.8170471191406, + "learning_rate": 6.098003629764065e-07, + "loss": 34.0525, + "step": 5408 + }, + { + "epoch": 19.527313769751693, + "grad_norm": 276.9403076171875, + "learning_rate": 6.04355716878403e-07, + "loss": 28.6084, + "step": 5409 + }, + { + "epoch": 19.530925507900676, + "grad_norm": 221.44195556640625, + "learning_rate": 5.989110707803993e-07, + "loss": 21.827, + "step": 5410 + }, + { + "epoch": 19.530925507900676, + "eval_loss": 0.601222813129425, + "eval_runtime": 3.1388, + "eval_samples_per_second": 57.029, + "eval_steps_per_second": 57.029, + "step": 5410 + }, + { + "epoch": 19.534537246049663, + "grad_norm": 215.0915069580078, + "learning_rate": 5.934664246823957e-07, + "loss": 21.4303, + "step": 5411 + }, + { + "epoch": 19.538148984198646, + "grad_norm": 230.7354736328125, + "learning_rate": 5.88021778584392e-07, + "loss": 22.3575, + "step": 5412 + }, + { + "epoch": 19.54176072234763, + "grad_norm": 257.53533935546875, + "learning_rate": 5.825771324863883e-07, + "loss": 23.2244, + "step": 5413 + }, + { + "epoch": 19.545372460496615, + "grad_norm": 226.0248260498047, + "learning_rate": 5.771324863883848e-07, + "loss": 37.427, + "step": 5414 + }, + { + "epoch": 19.5489841986456, + "grad_norm": 204.3394775390625, + "learning_rate": 5.716878402903811e-07, + "loss": 38.7522, + "step": 5415 + }, + { + "epoch": 19.55259593679458, + "grad_norm": 213.9196014404297, + "learning_rate": 5.662431941923776e-07, + "loss": 38.0999, + "step": 5416 + }, + { + "epoch": 19.55620767494357, + "grad_norm": 183.85964965820312, + "learning_rate": 5.607985480943739e-07, + "loss": 38.154, + "step": 5417 + }, + { + "epoch": 19.55981941309255, + "grad_norm": 212.41763305664062, + "learning_rate": 5.553539019963702e-07, + "loss": 38.0258, + "step": 5418 + }, + { + "epoch": 19.563431151241534, + "grad_norm": 225.71121215820312, + "learning_rate": 5.499092558983666e-07, + "loss": 38.8271, + "step": 5419 + }, + { + "epoch": 19.567042889390518, + "grad_norm": 235.203125, + "learning_rate": 5.44464609800363e-07, + "loss": 37.5532, + "step": 5420 + }, + { + "epoch": 19.567042889390518, + "eval_loss": 0.6008686423301697, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 5420 + }, + { + "epoch": 19.570654627539504, + "grad_norm": 208.5715789794922, + "learning_rate": 5.390199637023593e-07, + "loss": 40.1166, + "step": 5421 + }, + { + "epoch": 19.574266365688487, + "grad_norm": 264.13909912109375, + "learning_rate": 5.335753176043558e-07, + "loss": 37.8543, + "step": 5422 + }, + { + "epoch": 19.57787810383747, + "grad_norm": 218.41786193847656, + "learning_rate": 5.281306715063521e-07, + "loss": 39.0052, + "step": 5423 + }, + { + "epoch": 19.581489841986457, + "grad_norm": 286.737060546875, + "learning_rate": 5.226860254083486e-07, + "loss": 38.3982, + "step": 5424 + }, + { + "epoch": 19.58510158013544, + "grad_norm": 291.76617431640625, + "learning_rate": 5.172413793103448e-07, + "loss": 37.0972, + "step": 5425 + }, + { + "epoch": 19.588713318284423, + "grad_norm": 300.4125671386719, + "learning_rate": 5.117967332123412e-07, + "loss": 34.297, + "step": 5426 + }, + { + "epoch": 19.59232505643341, + "grad_norm": 359.1770935058594, + "learning_rate": 5.063520871143376e-07, + "loss": 35.5142, + "step": 5427 + }, + { + "epoch": 19.595936794582393, + "grad_norm": 384.48028564453125, + "learning_rate": 5.009074410163339e-07, + "loss": 36.9965, + "step": 5428 + }, + { + "epoch": 19.599548532731376, + "grad_norm": 415.5469055175781, + "learning_rate": 4.954627949183303e-07, + "loss": 37.3736, + "step": 5429 + }, + { + "epoch": 19.603160270880363, + "grad_norm": 236.56715393066406, + "learning_rate": 4.900181488203267e-07, + "loss": 36.4009, + "step": 5430 + }, + { + "epoch": 19.603160270880363, + "eval_loss": 0.6017860770225525, + "eval_runtime": 3.1384, + "eval_samples_per_second": 57.035, + "eval_steps_per_second": 57.035, + "step": 5430 + }, + { + "epoch": 19.606772009029346, + "grad_norm": 411.9438171386719, + "learning_rate": 4.845735027223231e-07, + "loss": 35.4744, + "step": 5431 + }, + { + "epoch": 19.61038374717833, + "grad_norm": 306.6455993652344, + "learning_rate": 4.791288566243194e-07, + "loss": 36.853, + "step": 5432 + }, + { + "epoch": 19.613995485327315, + "grad_norm": 289.98883056640625, + "learning_rate": 4.736842105263158e-07, + "loss": 37.7418, + "step": 5433 + }, + { + "epoch": 19.6176072234763, + "grad_norm": 227.83628845214844, + "learning_rate": 4.682395644283122e-07, + "loss": 36.2866, + "step": 5434 + }, + { + "epoch": 19.62121896162528, + "grad_norm": 260.56695556640625, + "learning_rate": 4.627949183303086e-07, + "loss": 35.5141, + "step": 5435 + }, + { + "epoch": 19.624830699774268, + "grad_norm": 236.0625762939453, + "learning_rate": 4.573502722323049e-07, + "loss": 37.7585, + "step": 5436 + }, + { + "epoch": 19.62844243792325, + "grad_norm": 299.8916015625, + "learning_rate": 4.5190562613430125e-07, + "loss": 39.0317, + "step": 5437 + }, + { + "epoch": 19.632054176072234, + "grad_norm": 236.15243530273438, + "learning_rate": 4.4646098003629764e-07, + "loss": 38.0213, + "step": 5438 + }, + { + "epoch": 19.635665914221217, + "grad_norm": 291.18182373046875, + "learning_rate": 4.41016333938294e-07, + "loss": 37.9197, + "step": 5439 + }, + { + "epoch": 19.639277652370204, + "grad_norm": 243.15419006347656, + "learning_rate": 4.3557168784029036e-07, + "loss": 36.2963, + "step": 5440 + }, + { + "epoch": 19.639277652370204, + "eval_loss": 0.6023871302604675, + "eval_runtime": 3.1823, + "eval_samples_per_second": 56.249, + "eval_steps_per_second": 56.249, + "step": 5440 + }, + { + "epoch": 19.642889390519187, + "grad_norm": 247.60049438476562, + "learning_rate": 4.3012704174228675e-07, + "loss": 36.5442, + "step": 5441 + }, + { + "epoch": 19.64650112866817, + "grad_norm": 268.0565490722656, + "learning_rate": 4.2468239564428314e-07, + "loss": 34.3726, + "step": 5442 + }, + { + "epoch": 19.650112866817157, + "grad_norm": 251.00057983398438, + "learning_rate": 4.192377495462795e-07, + "loss": 32.1691, + "step": 5443 + }, + { + "epoch": 19.65372460496614, + "grad_norm": 321.4367370605469, + "learning_rate": 4.1379310344827586e-07, + "loss": 31.5831, + "step": 5444 + }, + { + "epoch": 19.657336343115123, + "grad_norm": 328.7476501464844, + "learning_rate": 4.0834845735027225e-07, + "loss": 32.1178, + "step": 5445 + }, + { + "epoch": 19.66094808126411, + "grad_norm": 264.1122741699219, + "learning_rate": 4.029038112522686e-07, + "loss": 30.9057, + "step": 5446 + }, + { + "epoch": 19.664559819413093, + "grad_norm": 443.7752380371094, + "learning_rate": 3.97459165154265e-07, + "loss": 32.1608, + "step": 5447 + }, + { + "epoch": 19.668171557562076, + "grad_norm": 239.18614196777344, + "learning_rate": 3.9201451905626137e-07, + "loss": 32.152, + "step": 5448 + }, + { + "epoch": 19.671783295711062, + "grad_norm": 259.49249267578125, + "learning_rate": 3.8656987295825776e-07, + "loss": 32.9004, + "step": 5449 + }, + { + "epoch": 19.675395033860045, + "grad_norm": 270.965576171875, + "learning_rate": 3.811252268602541e-07, + "loss": 32.8006, + "step": 5450 + }, + { + "epoch": 19.675395033860045, + "eval_loss": 0.6013672947883606, + "eval_runtime": 3.15, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 5450 + }, + { + "epoch": 19.67900677200903, + "grad_norm": 261.61962890625, + "learning_rate": 3.756805807622505e-07, + "loss": 32.5202, + "step": 5451 + }, + { + "epoch": 19.682618510158015, + "grad_norm": 421.79974365234375, + "learning_rate": 3.7023593466424687e-07, + "loss": 33.8715, + "step": 5452 + }, + { + "epoch": 19.686230248306998, + "grad_norm": 257.54522705078125, + "learning_rate": 3.647912885662432e-07, + "loss": 33.7379, + "step": 5453 + }, + { + "epoch": 19.68984198645598, + "grad_norm": 290.4663391113281, + "learning_rate": 3.593466424682396e-07, + "loss": 34.0276, + "step": 5454 + }, + { + "epoch": 19.693453724604964, + "grad_norm": 358.5994567871094, + "learning_rate": 3.53901996370236e-07, + "loss": 33.697, + "step": 5455 + }, + { + "epoch": 19.69706546275395, + "grad_norm": 387.7028503417969, + "learning_rate": 3.4845735027223227e-07, + "loss": 36.1719, + "step": 5456 + }, + { + "epoch": 19.700677200902934, + "grad_norm": 358.7620544433594, + "learning_rate": 3.4301270417422866e-07, + "loss": 34.2382, + "step": 5457 + }, + { + "epoch": 19.704288939051917, + "grad_norm": 395.00140380859375, + "learning_rate": 3.3756805807622505e-07, + "loss": 34.9605, + "step": 5458 + }, + { + "epoch": 19.707900677200904, + "grad_norm": 291.4330749511719, + "learning_rate": 3.3212341197822143e-07, + "loss": 25.9945, + "step": 5459 + }, + { + "epoch": 19.711512415349887, + "grad_norm": 218.69113159179688, + "learning_rate": 3.2667876588021777e-07, + "loss": 21.5688, + "step": 5460 + }, + { + "epoch": 19.711512415349887, + "eval_loss": 0.601553201675415, + "eval_runtime": 3.1514, + "eval_samples_per_second": 56.799, + "eval_steps_per_second": 56.799, + "step": 5460 + }, + { + "epoch": 19.71512415349887, + "grad_norm": 236.54107666015625, + "learning_rate": 3.2123411978221416e-07, + "loss": 21.5821, + "step": 5461 + }, + { + "epoch": 19.718735891647857, + "grad_norm": 272.6702880859375, + "learning_rate": 3.1578947368421055e-07, + "loss": 21.9283, + "step": 5462 + }, + { + "epoch": 19.72234762979684, + "grad_norm": 264.74005126953125, + "learning_rate": 3.103448275862069e-07, + "loss": 22.9388, + "step": 5463 + }, + { + "epoch": 19.725959367945823, + "grad_norm": 207.89337158203125, + "learning_rate": 3.0490018148820327e-07, + "loss": 39.0581, + "step": 5464 + }, + { + "epoch": 19.72957110609481, + "grad_norm": 240.96636962890625, + "learning_rate": 2.9945553539019966e-07, + "loss": 38.9328, + "step": 5465 + }, + { + "epoch": 19.733182844243792, + "grad_norm": 239.6488037109375, + "learning_rate": 2.94010889292196e-07, + "loss": 38.9602, + "step": 5466 + }, + { + "epoch": 19.736794582392776, + "grad_norm": 233.20974731445312, + "learning_rate": 2.885662431941924e-07, + "loss": 39.4486, + "step": 5467 + }, + { + "epoch": 19.740406320541762, + "grad_norm": 224.98013305664062, + "learning_rate": 2.831215970961888e-07, + "loss": 39.0727, + "step": 5468 + }, + { + "epoch": 19.744018058690745, + "grad_norm": 245.26980590820312, + "learning_rate": 2.776769509981851e-07, + "loss": 37.5583, + "step": 5469 + }, + { + "epoch": 19.74762979683973, + "grad_norm": 205.14044189453125, + "learning_rate": 2.722323049001815e-07, + "loss": 38.6332, + "step": 5470 + }, + { + "epoch": 19.74762979683973, + "eval_loss": 0.6001273393630981, + "eval_runtime": 3.1492, + "eval_samples_per_second": 56.84, + "eval_steps_per_second": 56.84, + "step": 5470 + }, + { + "epoch": 19.751241534988715, + "grad_norm": 229.19940185546875, + "learning_rate": 2.667876588021779e-07, + "loss": 38.4545, + "step": 5471 + }, + { + "epoch": 19.754853273137698, + "grad_norm": 260.04083251953125, + "learning_rate": 2.613430127041743e-07, + "loss": 37.4461, + "step": 5472 + }, + { + "epoch": 19.75846501128668, + "grad_norm": 252.2135772705078, + "learning_rate": 2.558983666061706e-07, + "loss": 38.6207, + "step": 5473 + }, + { + "epoch": 19.762076749435664, + "grad_norm": 211.760009765625, + "learning_rate": 2.5045372050816695e-07, + "loss": 36.4307, + "step": 5474 + }, + { + "epoch": 19.76568848758465, + "grad_norm": 227.18177795410156, + "learning_rate": 2.4500907441016334e-07, + "loss": 35.7522, + "step": 5475 + }, + { + "epoch": 19.769300225733634, + "grad_norm": 276.8219299316406, + "learning_rate": 2.395644283121597e-07, + "loss": 35.3123, + "step": 5476 + }, + { + "epoch": 19.772911963882617, + "grad_norm": 302.77362060546875, + "learning_rate": 2.341197822141561e-07, + "loss": 35.8374, + "step": 5477 + }, + { + "epoch": 19.776523702031604, + "grad_norm": 279.4811096191406, + "learning_rate": 2.2867513611615246e-07, + "loss": 36.6637, + "step": 5478 + }, + { + "epoch": 19.780135440180587, + "grad_norm": 390.7204284667969, + "learning_rate": 2.2323049001814882e-07, + "loss": 35.9263, + "step": 5479 + }, + { + "epoch": 19.78374717832957, + "grad_norm": 250.87916564941406, + "learning_rate": 2.1778584392014518e-07, + "loss": 37.1529, + "step": 5480 + }, + { + "epoch": 19.78374717832957, + "eval_loss": 0.6011965274810791, + "eval_runtime": 3.1499, + "eval_samples_per_second": 56.826, + "eval_steps_per_second": 56.826, + "step": 5480 + }, + { + "epoch": 19.787358916478556, + "grad_norm": 285.9814453125, + "learning_rate": 2.1234119782214157e-07, + "loss": 36.2204, + "step": 5481 + }, + { + "epoch": 19.79097065462754, + "grad_norm": 263.5719299316406, + "learning_rate": 2.0689655172413793e-07, + "loss": 36.4458, + "step": 5482 + }, + { + "epoch": 19.794582392776523, + "grad_norm": 252.95606994628906, + "learning_rate": 2.014519056261343e-07, + "loss": 35.8917, + "step": 5483 + }, + { + "epoch": 19.79819413092551, + "grad_norm": 400.2224426269531, + "learning_rate": 1.9600725952813068e-07, + "loss": 37.6994, + "step": 5484 + }, + { + "epoch": 19.801805869074492, + "grad_norm": 304.3626403808594, + "learning_rate": 1.9056261343012705e-07, + "loss": 36.6016, + "step": 5485 + }, + { + "epoch": 19.805417607223475, + "grad_norm": 328.90875244140625, + "learning_rate": 1.8511796733212343e-07, + "loss": 38.4323, + "step": 5486 + }, + { + "epoch": 19.809029345372462, + "grad_norm": 242.90084838867188, + "learning_rate": 1.796733212341198e-07, + "loss": 37.1693, + "step": 5487 + }, + { + "epoch": 19.812641083521445, + "grad_norm": 246.82679748535156, + "learning_rate": 1.7422867513611613e-07, + "loss": 36.9844, + "step": 5488 + }, + { + "epoch": 19.816252821670428, + "grad_norm": 247.83578491210938, + "learning_rate": 1.6878402903811252e-07, + "loss": 37.1382, + "step": 5489 + }, + { + "epoch": 19.819864559819415, + "grad_norm": 346.5638732910156, + "learning_rate": 1.6333938294010889e-07, + "loss": 39.0924, + "step": 5490 + }, + { + "epoch": 19.819864559819415, + "eval_loss": 0.6002302765846252, + "eval_runtime": 3.1467, + "eval_samples_per_second": 56.884, + "eval_steps_per_second": 56.884, + "step": 5490 + }, + { + "epoch": 19.823476297968398, + "grad_norm": 268.696044921875, + "learning_rate": 1.5789473684210527e-07, + "loss": 35.7904, + "step": 5491 + }, + { + "epoch": 19.82708803611738, + "grad_norm": 236.77597045898438, + "learning_rate": 1.5245009074410164e-07, + "loss": 34.8324, + "step": 5492 + }, + { + "epoch": 19.830699774266364, + "grad_norm": 282.07012939453125, + "learning_rate": 1.47005444646098e-07, + "loss": 30.9181, + "step": 5493 + }, + { + "epoch": 19.83431151241535, + "grad_norm": 304.3028259277344, + "learning_rate": 1.415607985480944e-07, + "loss": 29.8768, + "step": 5494 + }, + { + "epoch": 19.837923250564334, + "grad_norm": 345.91217041015625, + "learning_rate": 1.3611615245009075e-07, + "loss": 29.9774, + "step": 5495 + }, + { + "epoch": 19.841534988713317, + "grad_norm": 305.09893798828125, + "learning_rate": 1.3067150635208714e-07, + "loss": 30.6578, + "step": 5496 + }, + { + "epoch": 19.845146726862303, + "grad_norm": 279.6992492675781, + "learning_rate": 1.2522686025408348e-07, + "loss": 31.6408, + "step": 5497 + }, + { + "epoch": 19.848758465011286, + "grad_norm": 433.50579833984375, + "learning_rate": 1.1978221415607984e-07, + "loss": 32.7726, + "step": 5498 + }, + { + "epoch": 19.85237020316027, + "grad_norm": 264.6114196777344, + "learning_rate": 1.1433756805807623e-07, + "loss": 33.2589, + "step": 5499 + }, + { + "epoch": 19.855981941309256, + "grad_norm": 233.0192108154297, + "learning_rate": 1.0889292196007259e-07, + "loss": 32.5284, + "step": 5500 + }, + { + "epoch": 19.855981941309256, + "eval_loss": 0.6009184718132019, + "eval_runtime": 3.1505, + "eval_samples_per_second": 56.816, + "eval_steps_per_second": 56.816, + "step": 5500 + }, + { + "epoch": 19.85959367945824, + "grad_norm": 268.5655212402344, + "learning_rate": 1.0344827586206897e-07, + "loss": 34.1255, + "step": 5501 + }, + { + "epoch": 19.863205417607222, + "grad_norm": 242.97332763671875, + "learning_rate": 9.800362976406534e-08, + "loss": 32.1586, + "step": 5502 + }, + { + "epoch": 19.86681715575621, + "grad_norm": 250.1754913330078, + "learning_rate": 9.255898366606172e-08, + "loss": 33.2971, + "step": 5503 + }, + { + "epoch": 19.870428893905192, + "grad_norm": 303.9489440917969, + "learning_rate": 8.711433756805807e-08, + "loss": 32.6599, + "step": 5504 + }, + { + "epoch": 19.874040632054175, + "grad_norm": 282.8628845214844, + "learning_rate": 8.166969147005444e-08, + "loss": 33.5164, + "step": 5505 + }, + { + "epoch": 19.877652370203162, + "grad_norm": 319.90228271484375, + "learning_rate": 7.622504537205082e-08, + "loss": 33.9399, + "step": 5506 + }, + { + "epoch": 19.881264108352145, + "grad_norm": 324.5431213378906, + "learning_rate": 7.07803992740472e-08, + "loss": 35.1216, + "step": 5507 + }, + { + "epoch": 19.884875846501128, + "grad_norm": 312.98297119140625, + "learning_rate": 6.533575317604357e-08, + "loss": 34.3538, + "step": 5508 + }, + { + "epoch": 19.888487584650115, + "grad_norm": 331.80718994140625, + "learning_rate": 5.989110707803992e-08, + "loss": 27.5229, + "step": 5509 + }, + { + "epoch": 19.892099322799098, + "grad_norm": 228.25613403320312, + "learning_rate": 5.4446460980036295e-08, + "loss": 22.0451, + "step": 5510 + }, + { + "epoch": 19.892099322799098, + "eval_loss": 0.599698543548584, + "eval_runtime": 3.1515, + "eval_samples_per_second": 56.798, + "eval_steps_per_second": 56.798, + "step": 5510 + }, + { + "epoch": 19.89571106094808, + "grad_norm": 211.70677185058594, + "learning_rate": 4.900181488203267e-08, + "loss": 21.0534, + "step": 5511 + }, + { + "epoch": 19.899322799097064, + "grad_norm": 209.34217834472656, + "learning_rate": 4.3557168784029033e-08, + "loss": 22.478, + "step": 5512 + }, + { + "epoch": 19.90293453724605, + "grad_norm": 219.7806396484375, + "learning_rate": 3.811252268602541e-08, + "loss": 23.3247, + "step": 5513 + }, + { + "epoch": 19.906546275395034, + "grad_norm": 243.7207489013672, + "learning_rate": 3.2667876588021785e-08, + "loss": 37.8099, + "step": 5514 + }, + { + "epoch": 19.910158013544017, + "grad_norm": 236.4864044189453, + "learning_rate": 2.7223230490018148e-08, + "loss": 37.3804, + "step": 5515 + }, + { + "epoch": 19.913769751693003, + "grad_norm": 269.2445373535156, + "learning_rate": 2.1778584392014517e-08, + "loss": 38.5405, + "step": 5516 + }, + { + "epoch": 19.917381489841986, + "grad_norm": 190.2155303955078, + "learning_rate": 1.6333938294010892e-08, + "loss": 37.7808, + "step": 5517 + }, + { + "epoch": 19.92099322799097, + "grad_norm": 228.72300720214844, + "learning_rate": 1.0889292196007258e-08, + "loss": 39.002, + "step": 5518 + }, + { + "epoch": 19.924604966139956, + "grad_norm": 305.3551025390625, + "learning_rate": 5.444646098003629e-09, + "loss": 37.3566, + "step": 5519 + }, + { + "epoch": 19.92821670428894, + "grad_norm": 300.5411071777344, + "learning_rate": 0.0, + "loss": 34.3978, + "step": 5520 + }, + { + "epoch": 19.92821670428894, + "eval_loss": 0.6009259223937988, + "eval_runtime": 3.1539, + "eval_samples_per_second": 56.756, + "eval_steps_per_second": 56.756, + "step": 5520 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.7070932149777203e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-5520/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ff2bcf5eb31e97481ec640082d46c4c41628a051 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439dd218c72255eb4cd284b3ea5604e3df661d34f63367549eae84a02c732da7 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..31b42b8cc46aca41480cb3788abccaaede6fd8d3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43b188457f5e3b771263bf62448753b16e481422822be1d1513ed1f536e13f4 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c415e5b4736c82b5932eac4d3d7e86b3c5966a1 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19c1c4a55b9544dce2e39c26c8fe2586967434d7c19609b76381655a15152642 +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2788fdd9671ce3dbef88243720f9fdb460b21c54 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883b16917ea3121fde28b1d70c2092707e72c2daca32701b346547f560038963 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..261906383a38319e67e18fea41313d8121e28c94 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e59252530920d303752b8a748ee741f7fd5ec5cb128147c00e2d25fe6c92d5b +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..83b22796710925c8e19fb64beb58a00803889558 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/trainer_state.json @@ -0,0 +1,4713 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.166139954853273, + "eval_steps": 10, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.945029075402752e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/README.md b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e4feb3150528da03af8064a7fd88a913317bf7af --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +base_model: meta-llama/Llama-3.2-1B-Instruct +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.14.0 \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/adapter_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0f59cbf4b6144798e2c4d1543aea9f3e1212b557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/adapter_config.json @@ -0,0 +1,37 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "bias": "none", + "eva_config": null, + "exclude_modules": null, + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_bias": false, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "o_proj", + "k_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/adapter_model.safetensors b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..249b6ead0f722821ddf195416ab5187b77a768c8 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b16cc7ea912e1165b49db2b2803efa0fa77fa593dbf4cae7c08c3eefc73773f0 +size 45118424 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/config.json new file mode 100644 index 0000000000000000000000000000000000000000..72a2f6f76b611143dfc46337dd423637b90816e3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/config.json @@ -0,0 +1,44 @@ +{ + "_attn_implementation_autoset": true, + "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct", + "action_dim": 4, + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "eos_token_id": [ + 128001, + 128008, + 128009 + ], + "head_dim": 64, + "hidden_act": "silu", + "hidden_size": 2048, + "initializer_range": 0.02, + "intermediate_size": 8192, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama_lowdim", + "num_attention_heads": 32, + "num_hidden_layers": 16, + "num_key_value_heads": 8, + "obs_dim": 9, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.47.1", + "use_cache": false, + "use_joint_mlp_projector": true, + "vocab_size": 128256 +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/mlp_projector.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/mlp_projector.bin new file mode 100644 index 0000000000000000000000000000000000000000..5f4e0fa975b9bbbbd2fbf0091a749858cd09e239 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/mlp_projector.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6d4eeeaae77ba5a84d1a50b9c6c60d9e914bab888204c0dbfeff86161eefef1 +size 16902336 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/optimizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ae85b71b4b61ccad45c644a7a3e106d14f90da8 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39038b6e0308302ac5a2acbe2f67197925c8aa0121771e63c551bcfc27782d7d +size 124114426 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/rng_state.pth b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4fc0d7d380d97b5cc99d9f13fb3cd13b845742a0 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0fee1a2e75be1a3d6a6326307048cbbd266b703305f3095650e5f2cdb7f80b7 +size 14244 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/scheduler.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..eeede9fdf921f17b870efc4dd5790ef6d13ac557 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c47a0955c49e432f51665afc0252e375aeaf62065c373876f8c0fdd915f0518e +size 1064 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/special_tokens_map.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..b43be96621d147110fb8a18b5776ec6e38516127 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/special_tokens_map.json @@ -0,0 +1,17 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "<|eot_id|>" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/tokenizer.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1c1d8d5c9024994f1d3b00f9662b8dd89ca13cf2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/tokenizer_config.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..fb0e8efdce308a03b5350019cf1f24fa4375396f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/tokenizer_config.json @@ -0,0 +1,2064 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "chat_template": "{{- bos_token }}\n{%- if custom_tools is defined %}\n {%- set tools = custom_tools %}\n{%- endif %}\n{%- if not tools_in_user_message is defined %}\n {%- set tools_in_user_message = true %}\n{%- endif %}\n{%- if not date_string is defined %}\n {%- if strftime_now is defined %}\n {%- set date_string = strftime_now(\"%d %b %Y\") %}\n {%- else %}\n {%- set date_string = \"26 Jul 2024\" %}\n {%- endif %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n{%- else %}\n {%- set system_message = \"\" %}\n{%- endif %}\n\n{#- System message #}\n{{- \"<|start_header_id|>system<|end_header_id|>\\n\\n\" }}\n{%- if tools is not none %}\n {{- \"Environment: ipython\\n\" }}\n{%- endif %}\n{{- \"Cutting Knowledge Date: December 2023\\n\" }}\n{{- \"Today Date: \" + date_string + \"\\n\\n\" }}\n{%- if tools is not none and not tools_in_user_message %}\n {{- \"You have access to the following functions. To call a function, please respond with JSON for a function call.\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n{%- endif %}\n{{- system_message }}\n{{- \"<|eot_id|>\" }}\n\n{#- Custom tools are passed in a user message with some extra guidance #}\n{%- if tools_in_user_message and not tools is none %}\n {#- Extract the first user message so we can plug it in here #}\n {%- if messages | length != 0 %}\n {%- set first_user_message = messages[0]['content']|trim %}\n {%- set messages = messages[1:] %}\n {%- else %}\n {{- raise_exception(\"Cannot put tools in the first user message when there's no first user message!\") }}\n{%- endif %}\n {{- '<|start_header_id|>user<|end_header_id|>\\n\\n' -}}\n {{- \"Given the following functions, please respond with a JSON for a function call \" }}\n {{- \"with its proper arguments that best answers the given prompt.\\n\\n\" }}\n {{- 'Respond in the format {\"name\": function name, \"parameters\": dictionary of argument name and its value}.' }}\n {{- \"Do not use variables.\\n\\n\" }}\n {%- for t in tools %}\n {{- t | tojson(indent=4) }}\n {{- \"\\n\\n\" }}\n {%- endfor %}\n {{- first_user_message + \"<|eot_id|>\"}}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' }}\n {%- elif 'tool_calls' in message %}\n {%- if not message.tool_calls|length == 1 %}\n {{- raise_exception(\"This model only supports single tool-calls at once!\") }}\n {%- endif %}\n {%- set tool_call = message.tool_calls[0].function %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' -}}\n {{- '{\"name\": \"' + tool_call.name + '\", ' }}\n {{- '\"parameters\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- \"}\" }}\n {{- \"<|eot_id|>\" }}\n {%- elif message.role == \"tool\" or message.role == \"ipython\" %}\n {{- \"<|start_header_id|>ipython<|end_header_id|>\\n\\n\" }}\n {%- if message.content is mapping or message.content is iterable %}\n {{- message.content | tojson }}\n {%- else %}\n {{- message.content }}\n {%- endif %}\n {{- \"<|eot_id|>\" }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}\n", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/trainer_state.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..cb356d38cc23a8ff1fe1e758b94a3418206da4f3 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/trainer_state.json @@ -0,0 +1,6273 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.888487584650113, + "eval_steps": 10, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0036117381489841984, + "grad_norm": 826.8458251953125, + "learning_rate": 3e-06, + "loss": 72.9219, + "step": 1 + }, + { + "epoch": 0.007223476297968397, + "grad_norm": 716.6332397460938, + "learning_rate": 6e-06, + "loss": 72.5411, + "step": 2 + }, + { + "epoch": 0.010835214446952596, + "grad_norm": 653.662109375, + "learning_rate": 9e-06, + "loss": 68.2333, + "step": 3 + }, + { + "epoch": 0.014446952595936794, + "grad_norm": 678.8214111328125, + "learning_rate": 1.2e-05, + "loss": 67.0506, + "step": 4 + }, + { + "epoch": 0.01805869074492099, + "grad_norm": 584.922607421875, + "learning_rate": 1.5e-05, + "loss": 67.0048, + "step": 5 + }, + { + "epoch": 0.02167042889390519, + "grad_norm": 678.7247924804688, + "learning_rate": 1.8e-05, + "loss": 68.4059, + "step": 6 + }, + { + "epoch": 0.02528216704288939, + "grad_norm": 911.47509765625, + "learning_rate": 2.1e-05, + "loss": 71.0148, + "step": 7 + }, + { + "epoch": 0.028893905191873587, + "grad_norm": 924.4578247070312, + "learning_rate": 2.4e-05, + "loss": 71.4146, + "step": 8 + }, + { + "epoch": 0.03250564334085779, + "grad_norm": 1064.275634765625, + "learning_rate": 2.7000000000000002e-05, + "loss": 70.8432, + "step": 9 + }, + { + "epoch": 0.03611738148984198, + "grad_norm": 850.4259033203125, + "learning_rate": 3e-05, + "loss": 73.1796, + "step": 10 + }, + { + "epoch": 0.03611738148984198, + "eval_loss": 1.0189366340637207, + "eval_runtime": 3.1411, + "eval_samples_per_second": 56.986, + "eval_steps_per_second": 56.986, + "step": 10 + }, + { + "epoch": 0.03972911963882619, + "grad_norm": 870.9306030273438, + "learning_rate": 2.9994555353901996e-05, + "loss": 70.5576, + "step": 11 + }, + { + "epoch": 0.04334085778781038, + "grad_norm": 794.9625244140625, + "learning_rate": 2.998911070780399e-05, + "loss": 65.2322, + "step": 12 + }, + { + "epoch": 0.04695259593679458, + "grad_norm": 989.5623779296875, + "learning_rate": 2.998366606170599e-05, + "loss": 62.7158, + "step": 13 + }, + { + "epoch": 0.05056433408577878, + "grad_norm": 941.0211181640625, + "learning_rate": 2.9978221415607986e-05, + "loss": 65.801, + "step": 14 + }, + { + "epoch": 0.05417607223476298, + "grad_norm": 863.9938354492188, + "learning_rate": 2.9972776769509984e-05, + "loss": 63.4828, + "step": 15 + }, + { + "epoch": 0.057787810383747175, + "grad_norm": 711.3890991210938, + "learning_rate": 2.996733212341198e-05, + "loss": 61.3068, + "step": 16 + }, + { + "epoch": 0.06139954853273138, + "grad_norm": 885.39501953125, + "learning_rate": 2.9961887477313975e-05, + "loss": 64.1753, + "step": 17 + }, + { + "epoch": 0.06501128668171557, + "grad_norm": 655.5796508789062, + "learning_rate": 2.995644283121597e-05, + "loss": 63.6775, + "step": 18 + }, + { + "epoch": 0.06862302483069978, + "grad_norm": 681.5781860351562, + "learning_rate": 2.995099818511797e-05, + "loss": 62.8369, + "step": 19 + }, + { + "epoch": 0.07223476297968397, + "grad_norm": 605.4241943359375, + "learning_rate": 2.9945553539019965e-05, + "loss": 61.3176, + "step": 20 + }, + { + "epoch": 0.07223476297968397, + "eval_loss": 0.9650000929832458, + "eval_runtime": 3.1443, + "eval_samples_per_second": 56.928, + "eval_steps_per_second": 56.928, + "step": 20 + }, + { + "epoch": 0.07584650112866817, + "grad_norm": 588.86376953125, + "learning_rate": 2.994010889292196e-05, + "loss": 61.9691, + "step": 21 + }, + { + "epoch": 0.07945823927765237, + "grad_norm": 729.6580810546875, + "learning_rate": 2.9934664246823956e-05, + "loss": 61.2061, + "step": 22 + }, + { + "epoch": 0.08306997742663656, + "grad_norm": 621.9948120117188, + "learning_rate": 2.992921960072595e-05, + "loss": 63.1849, + "step": 23 + }, + { + "epoch": 0.08668171557562077, + "grad_norm": 816.0555419921875, + "learning_rate": 2.9923774954627953e-05, + "loss": 64.0214, + "step": 24 + }, + { + "epoch": 0.09029345372460497, + "grad_norm": 728.7860107421875, + "learning_rate": 2.991833030852995e-05, + "loss": 62.0578, + "step": 25 + }, + { + "epoch": 0.09390519187358916, + "grad_norm": 897.5223999023438, + "learning_rate": 2.9912885662431944e-05, + "loss": 62.6916, + "step": 26 + }, + { + "epoch": 0.09751693002257336, + "grad_norm": 624.7844848632812, + "learning_rate": 2.990744101633394e-05, + "loss": 61.2081, + "step": 27 + }, + { + "epoch": 0.10112866817155756, + "grad_norm": 661.22119140625, + "learning_rate": 2.9901996370235935e-05, + "loss": 60.2182, + "step": 28 + }, + { + "epoch": 0.10474040632054175, + "grad_norm": 574.8737182617188, + "learning_rate": 2.989655172413793e-05, + "loss": 57.5996, + "step": 29 + }, + { + "epoch": 0.10835214446952596, + "grad_norm": 766.5988159179688, + "learning_rate": 2.989110707803993e-05, + "loss": 55.8385, + "step": 30 + }, + { + "epoch": 0.10835214446952596, + "eval_loss": 0.9189058542251587, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.014, + "eval_steps_per_second": 57.014, + "step": 30 + }, + { + "epoch": 0.11196388261851016, + "grad_norm": 851.9244995117188, + "learning_rate": 2.9885662431941924e-05, + "loss": 53.8883, + "step": 31 + }, + { + "epoch": 0.11557562076749435, + "grad_norm": 689.07470703125, + "learning_rate": 2.988021778584392e-05, + "loss": 52.2324, + "step": 32 + }, + { + "epoch": 0.11918735891647855, + "grad_norm": 716.2824096679688, + "learning_rate": 2.9874773139745915e-05, + "loss": 54.1271, + "step": 33 + }, + { + "epoch": 0.12279909706546276, + "grad_norm": 718.0765991210938, + "learning_rate": 2.9869328493647914e-05, + "loss": 50.9066, + "step": 34 + }, + { + "epoch": 0.12641083521444696, + "grad_norm": 774.018798828125, + "learning_rate": 2.9863883847549912e-05, + "loss": 51.6759, + "step": 35 + }, + { + "epoch": 0.13002257336343115, + "grad_norm": 725.5440063476562, + "learning_rate": 2.9858439201451908e-05, + "loss": 52.6699, + "step": 36 + }, + { + "epoch": 0.13363431151241534, + "grad_norm": 669.84765625, + "learning_rate": 2.9852994555353903e-05, + "loss": 51.6784, + "step": 37 + }, + { + "epoch": 0.13724604966139956, + "grad_norm": 569.4988403320312, + "learning_rate": 2.98475499092559e-05, + "loss": 53.7148, + "step": 38 + }, + { + "epoch": 0.14085778781038374, + "grad_norm": 723.3594360351562, + "learning_rate": 2.9842105263157894e-05, + "loss": 54.6741, + "step": 39 + }, + { + "epoch": 0.14446952595936793, + "grad_norm": 709.8211059570312, + "learning_rate": 2.983666061705989e-05, + "loss": 52.1797, + "step": 40 + }, + { + "epoch": 0.14446952595936793, + "eval_loss": 0.8676205277442932, + "eval_runtime": 3.1293, + "eval_samples_per_second": 57.202, + "eval_steps_per_second": 57.202, + "step": 40 + }, + { + "epoch": 0.14808126410835215, + "grad_norm": 641.121337890625, + "learning_rate": 2.9831215970961888e-05, + "loss": 50.9864, + "step": 41 + }, + { + "epoch": 0.15169300225733634, + "grad_norm": 653.1666259765625, + "learning_rate": 2.9825771324863884e-05, + "loss": 50.4881, + "step": 42 + }, + { + "epoch": 0.15530474040632053, + "grad_norm": 701.0926513671875, + "learning_rate": 2.9820326678765882e-05, + "loss": 51.6959, + "step": 43 + }, + { + "epoch": 0.15891647855530475, + "grad_norm": 838.512451171875, + "learning_rate": 2.9814882032667878e-05, + "loss": 54.8321, + "step": 44 + }, + { + "epoch": 0.16252821670428894, + "grad_norm": 905.4413452148438, + "learning_rate": 2.9809437386569873e-05, + "loss": 50.6469, + "step": 45 + }, + { + "epoch": 0.16613995485327313, + "grad_norm": 762.02783203125, + "learning_rate": 2.9803992740471872e-05, + "loss": 47.2041, + "step": 46 + }, + { + "epoch": 0.16975169300225734, + "grad_norm": 718.588623046875, + "learning_rate": 2.9798548094373867e-05, + "loss": 36.6458, + "step": 47 + }, + { + "epoch": 0.17336343115124153, + "grad_norm": 974.5386962890625, + "learning_rate": 2.9793103448275863e-05, + "loss": 35.4111, + "step": 48 + }, + { + "epoch": 0.17697516930022572, + "grad_norm": 809.5028076171875, + "learning_rate": 2.9787658802177858e-05, + "loss": 35.6902, + "step": 49 + }, + { + "epoch": 0.18058690744920994, + "grad_norm": 814.4694213867188, + "learning_rate": 2.9782214156079853e-05, + "loss": 34.4502, + "step": 50 + }, + { + "epoch": 0.18058690744920994, + "eval_loss": 0.8749106526374817, + "eval_runtime": 3.133, + "eval_samples_per_second": 57.133, + "eval_steps_per_second": 57.133, + "step": 50 + }, + { + "epoch": 0.18419864559819413, + "grad_norm": 2068.640380859375, + "learning_rate": 2.9776769509981852e-05, + "loss": 67.7942, + "step": 51 + }, + { + "epoch": 0.18781038374717832, + "grad_norm": 1760.789306640625, + "learning_rate": 2.977132486388385e-05, + "loss": 64.3457, + "step": 52 + }, + { + "epoch": 0.19142212189616253, + "grad_norm": 1317.9237060546875, + "learning_rate": 2.9765880217785846e-05, + "loss": 62.0075, + "step": 53 + }, + { + "epoch": 0.19503386004514672, + "grad_norm": 949.7896118164062, + "learning_rate": 2.976043557168784e-05, + "loss": 60.4988, + "step": 54 + }, + { + "epoch": 0.1986455981941309, + "grad_norm": 862.1629028320312, + "learning_rate": 2.9754990925589837e-05, + "loss": 56.8426, + "step": 55 + }, + { + "epoch": 0.20225733634311513, + "grad_norm": 978.5818481445312, + "learning_rate": 2.9749546279491832e-05, + "loss": 56.7855, + "step": 56 + }, + { + "epoch": 0.20586907449209932, + "grad_norm": 1055.5872802734375, + "learning_rate": 2.974410163339383e-05, + "loss": 58.6869, + "step": 57 + }, + { + "epoch": 0.2094808126410835, + "grad_norm": 971.089599609375, + "learning_rate": 2.9738656987295827e-05, + "loss": 57.318, + "step": 58 + }, + { + "epoch": 0.21309255079006773, + "grad_norm": 823.1680908203125, + "learning_rate": 2.9733212341197822e-05, + "loss": 56.7783, + "step": 59 + }, + { + "epoch": 0.21670428893905191, + "grad_norm": 660.5692138671875, + "learning_rate": 2.9727767695099817e-05, + "loss": 57.0712, + "step": 60 + }, + { + "epoch": 0.21670428893905191, + "eval_loss": 0.8012754917144775, + "eval_runtime": 3.1336, + "eval_samples_per_second": 57.123, + "eval_steps_per_second": 57.123, + "step": 60 + }, + { + "epoch": 0.2203160270880361, + "grad_norm": 669.174072265625, + "learning_rate": 2.9722323049001816e-05, + "loss": 55.192, + "step": 61 + }, + { + "epoch": 0.22392776523702032, + "grad_norm": 688.8255004882812, + "learning_rate": 2.971687840290381e-05, + "loss": 50.8828, + "step": 62 + }, + { + "epoch": 0.2275395033860045, + "grad_norm": 699.8623657226562, + "learning_rate": 2.971143375680581e-05, + "loss": 50.3083, + "step": 63 + }, + { + "epoch": 0.2311512415349887, + "grad_norm": 559.8364868164062, + "learning_rate": 2.9705989110707806e-05, + "loss": 49.7228, + "step": 64 + }, + { + "epoch": 0.23476297968397292, + "grad_norm": 709.3836059570312, + "learning_rate": 2.97005444646098e-05, + "loss": 49.8954, + "step": 65 + }, + { + "epoch": 0.2383747178329571, + "grad_norm": 678.072265625, + "learning_rate": 2.9695099818511796e-05, + "loss": 49.1461, + "step": 66 + }, + { + "epoch": 0.24198645598194132, + "grad_norm": 672.2944946289062, + "learning_rate": 2.9689655172413792e-05, + "loss": 49.6423, + "step": 67 + }, + { + "epoch": 0.2455981941309255, + "grad_norm": 494.2787780761719, + "learning_rate": 2.968421052631579e-05, + "loss": 49.3827, + "step": 68 + }, + { + "epoch": 0.2492099322799097, + "grad_norm": 440.1124267578125, + "learning_rate": 2.9678765880217786e-05, + "loss": 51.385, + "step": 69 + }, + { + "epoch": 0.2528216704288939, + "grad_norm": 592.347412109375, + "learning_rate": 2.9673321234119785e-05, + "loss": 50.4029, + "step": 70 + }, + { + "epoch": 0.2528216704288939, + "eval_loss": 0.7771623730659485, + "eval_runtime": 3.1347, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 70 + }, + { + "epoch": 0.2564334085778781, + "grad_norm": 637.4396362304688, + "learning_rate": 2.966787658802178e-05, + "loss": 51.1812, + "step": 71 + }, + { + "epoch": 0.2600451467268623, + "grad_norm": 485.1819763183594, + "learning_rate": 2.9662431941923776e-05, + "loss": 51.0345, + "step": 72 + }, + { + "epoch": 0.2636568848758465, + "grad_norm": 598.6526489257812, + "learning_rate": 2.9656987295825774e-05, + "loss": 52.2199, + "step": 73 + }, + { + "epoch": 0.2672686230248307, + "grad_norm": 554.0598754882812, + "learning_rate": 2.965154264972777e-05, + "loss": 51.7395, + "step": 74 + }, + { + "epoch": 0.2708803611738149, + "grad_norm": 587.4779052734375, + "learning_rate": 2.9646098003629765e-05, + "loss": 51.124, + "step": 75 + }, + { + "epoch": 0.2744920993227991, + "grad_norm": 483.8338317871094, + "learning_rate": 2.964065335753176e-05, + "loss": 50.7046, + "step": 76 + }, + { + "epoch": 0.2781038374717833, + "grad_norm": 556.6511840820312, + "learning_rate": 2.9635208711433756e-05, + "loss": 49.4543, + "step": 77 + }, + { + "epoch": 0.2817155756207675, + "grad_norm": 535.6243286132812, + "learning_rate": 2.962976406533575e-05, + "loss": 49.1305, + "step": 78 + }, + { + "epoch": 0.2853273137697517, + "grad_norm": 550.9852905273438, + "learning_rate": 2.962431941923775e-05, + "loss": 47.6811, + "step": 79 + }, + { + "epoch": 0.28893905191873587, + "grad_norm": 686.528076171875, + "learning_rate": 2.961887477313975e-05, + "loss": 44.97, + "step": 80 + }, + { + "epoch": 0.28893905191873587, + "eval_loss": 0.7604023814201355, + "eval_runtime": 3.1365, + "eval_samples_per_second": 57.07, + "eval_steps_per_second": 57.07, + "step": 80 + }, + { + "epoch": 0.2925507900677201, + "grad_norm": 828.4725952148438, + "learning_rate": 2.9613430127041744e-05, + "loss": 42.5007, + "step": 81 + }, + { + "epoch": 0.2961625282167043, + "grad_norm": 644.0596313476562, + "learning_rate": 2.960798548094374e-05, + "loss": 41.9718, + "step": 82 + }, + { + "epoch": 0.2997742663656885, + "grad_norm": 578.7656860351562, + "learning_rate": 2.9602540834845735e-05, + "loss": 44.1048, + "step": 83 + }, + { + "epoch": 0.3033860045146727, + "grad_norm": 589.760498046875, + "learning_rate": 2.9597096188747734e-05, + "loss": 43.6878, + "step": 84 + }, + { + "epoch": 0.30699774266365687, + "grad_norm": 715.7012939453125, + "learning_rate": 2.959165154264973e-05, + "loss": 44.3374, + "step": 85 + }, + { + "epoch": 0.31060948081264106, + "grad_norm": 649.3252563476562, + "learning_rate": 2.9586206896551724e-05, + "loss": 44.4499, + "step": 86 + }, + { + "epoch": 0.3142212189616253, + "grad_norm": 640.3419189453125, + "learning_rate": 2.958076225045372e-05, + "loss": 44.4535, + "step": 87 + }, + { + "epoch": 0.3178329571106095, + "grad_norm": 591.23388671875, + "learning_rate": 2.9575317604355715e-05, + "loss": 45.0348, + "step": 88 + }, + { + "epoch": 0.3214446952595937, + "grad_norm": 544.8179321289062, + "learning_rate": 2.9569872958257714e-05, + "loss": 44.1963, + "step": 89 + }, + { + "epoch": 0.32505643340857787, + "grad_norm": 594.55224609375, + "learning_rate": 2.9564428312159713e-05, + "loss": 44.9479, + "step": 90 + }, + { + "epoch": 0.32505643340857787, + "eval_loss": 0.7576387524604797, + "eval_runtime": 3.1374, + "eval_samples_per_second": 57.053, + "eval_steps_per_second": 57.053, + "step": 90 + }, + { + "epoch": 0.32866817155756206, + "grad_norm": 536.4320068359375, + "learning_rate": 2.9558983666061708e-05, + "loss": 45.3891, + "step": 91 + }, + { + "epoch": 0.33227990970654625, + "grad_norm": 536.9632568359375, + "learning_rate": 2.9553539019963703e-05, + "loss": 44.9822, + "step": 92 + }, + { + "epoch": 0.3358916478555305, + "grad_norm": 505.9728698730469, + "learning_rate": 2.95480943738657e-05, + "loss": 45.066, + "step": 93 + }, + { + "epoch": 0.3395033860045147, + "grad_norm": 453.0039367675781, + "learning_rate": 2.9542649727767694e-05, + "loss": 43.6293, + "step": 94 + }, + { + "epoch": 0.3431151241534989, + "grad_norm": 466.6203308105469, + "learning_rate": 2.9537205081669693e-05, + "loss": 44.3293, + "step": 95 + }, + { + "epoch": 0.34672686230248306, + "grad_norm": 532.4081420898438, + "learning_rate": 2.953176043557169e-05, + "loss": 40.2154, + "step": 96 + }, + { + "epoch": 0.35033860045146725, + "grad_norm": 577.1102294921875, + "learning_rate": 2.9526315789473684e-05, + "loss": 31.5673, + "step": 97 + }, + { + "epoch": 0.35395033860045144, + "grad_norm": 441.4743347167969, + "learning_rate": 2.9520871143375683e-05, + "loss": 29.3586, + "step": 98 + }, + { + "epoch": 0.3575620767494357, + "grad_norm": 432.3975830078125, + "learning_rate": 2.9515426497277678e-05, + "loss": 29.042, + "step": 99 + }, + { + "epoch": 0.3611738148984199, + "grad_norm": 642.6970825195312, + "learning_rate": 2.9509981851179673e-05, + "loss": 31.7708, + "step": 100 + }, + { + "epoch": 0.3611738148984199, + "eval_loss": 0.8288812637329102, + "eval_runtime": 3.1289, + "eval_samples_per_second": 57.208, + "eval_steps_per_second": 57.208, + "step": 100 + }, + { + "epoch": 0.36478555304740407, + "grad_norm": 1607.9227294921875, + "learning_rate": 2.9504537205081672e-05, + "loss": 64.0132, + "step": 101 + }, + { + "epoch": 0.36839729119638825, + "grad_norm": 1462.2889404296875, + "learning_rate": 2.9499092558983667e-05, + "loss": 62.4924, + "step": 102 + }, + { + "epoch": 0.37200902934537244, + "grad_norm": 1075.0196533203125, + "learning_rate": 2.9493647912885663e-05, + "loss": 58.4323, + "step": 103 + }, + { + "epoch": 0.37562076749435663, + "grad_norm": 884.6957397460938, + "learning_rate": 2.9488203266787658e-05, + "loss": 55.141, + "step": 104 + }, + { + "epoch": 0.3792325056433409, + "grad_norm": 783.4414672851562, + "learning_rate": 2.9482758620689654e-05, + "loss": 54.163, + "step": 105 + }, + { + "epoch": 0.38284424379232507, + "grad_norm": 982.4120483398438, + "learning_rate": 2.9477313974591652e-05, + "loss": 55.1398, + "step": 106 + }, + { + "epoch": 0.38645598194130926, + "grad_norm": 853.049560546875, + "learning_rate": 2.947186932849365e-05, + "loss": 53.8404, + "step": 107 + }, + { + "epoch": 0.39006772009029345, + "grad_norm": 722.6901245117188, + "learning_rate": 2.9466424682395647e-05, + "loss": 53.1712, + "step": 108 + }, + { + "epoch": 0.39367945823927764, + "grad_norm": 691.1047973632812, + "learning_rate": 2.9460980036297642e-05, + "loss": 53.1349, + "step": 109 + }, + { + "epoch": 0.3972911963882618, + "grad_norm": 659.1260986328125, + "learning_rate": 2.9455535390199637e-05, + "loss": 53.1488, + "step": 110 + }, + { + "epoch": 0.3972911963882618, + "eval_loss": 0.7457038164138794, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.115, + "eval_steps_per_second": 57.115, + "step": 110 + }, + { + "epoch": 0.40090293453724607, + "grad_norm": 575.7744750976562, + "learning_rate": 2.9450090744101633e-05, + "loss": 49.9333, + "step": 111 + }, + { + "epoch": 0.40451467268623026, + "grad_norm": 482.8963317871094, + "learning_rate": 2.944464609800363e-05, + "loss": 47.9028, + "step": 112 + }, + { + "epoch": 0.40812641083521445, + "grad_norm": 563.2509765625, + "learning_rate": 2.9439201451905627e-05, + "loss": 46.8302, + "step": 113 + }, + { + "epoch": 0.41173814898419864, + "grad_norm": 597.126953125, + "learning_rate": 2.9433756805807622e-05, + "loss": 46.6043, + "step": 114 + }, + { + "epoch": 0.4153498871331828, + "grad_norm": 674.9114990234375, + "learning_rate": 2.9428312159709618e-05, + "loss": 47.9899, + "step": 115 + }, + { + "epoch": 0.418961625282167, + "grad_norm": 564.3960571289062, + "learning_rate": 2.9422867513611616e-05, + "loss": 46.5175, + "step": 116 + }, + { + "epoch": 0.42257336343115126, + "grad_norm": 482.7253723144531, + "learning_rate": 2.9417422867513615e-05, + "loss": 46.1521, + "step": 117 + }, + { + "epoch": 0.42618510158013545, + "grad_norm": 412.52935791015625, + "learning_rate": 2.941197822141561e-05, + "loss": 46.1505, + "step": 118 + }, + { + "epoch": 0.42979683972911964, + "grad_norm": 483.7874450683594, + "learning_rate": 2.9406533575317606e-05, + "loss": 47.1023, + "step": 119 + }, + { + "epoch": 0.43340857787810383, + "grad_norm": 469.2854309082031, + "learning_rate": 2.94010889292196e-05, + "loss": 46.5822, + "step": 120 + }, + { + "epoch": 0.43340857787810383, + "eval_loss": 0.719998300075531, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 120 + }, + { + "epoch": 0.437020316027088, + "grad_norm": 413.3222351074219, + "learning_rate": 2.9395644283121597e-05, + "loss": 46.4077, + "step": 121 + }, + { + "epoch": 0.4406320541760722, + "grad_norm": 473.6437683105469, + "learning_rate": 2.9390199637023592e-05, + "loss": 46.7971, + "step": 122 + }, + { + "epoch": 0.44424379232505645, + "grad_norm": 477.3919677734375, + "learning_rate": 2.938475499092559e-05, + "loss": 48.0664, + "step": 123 + }, + { + "epoch": 0.44785553047404064, + "grad_norm": 505.3496398925781, + "learning_rate": 2.9379310344827586e-05, + "loss": 47.9131, + "step": 124 + }, + { + "epoch": 0.45146726862302483, + "grad_norm": 502.92755126953125, + "learning_rate": 2.937386569872958e-05, + "loss": 47.1492, + "step": 125 + }, + { + "epoch": 0.455079006772009, + "grad_norm": 483.64117431640625, + "learning_rate": 2.936842105263158e-05, + "loss": 49.1792, + "step": 126 + }, + { + "epoch": 0.4586907449209932, + "grad_norm": 459.92559814453125, + "learning_rate": 2.9362976406533576e-05, + "loss": 49.4426, + "step": 127 + }, + { + "epoch": 0.4623024830699774, + "grad_norm": 401.9190673828125, + "learning_rate": 2.9357531760435575e-05, + "loss": 46.2051, + "step": 128 + }, + { + "epoch": 0.46591422121896164, + "grad_norm": 601.756103515625, + "learning_rate": 2.935208711433757e-05, + "loss": 43.9258, + "step": 129 + }, + { + "epoch": 0.46952595936794583, + "grad_norm": 954.7610473632812, + "learning_rate": 2.9346642468239565e-05, + "loss": 43.7106, + "step": 130 + }, + { + "epoch": 0.46952595936794583, + "eval_loss": 0.7346343398094177, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 130 + }, + { + "epoch": 0.47313769751693, + "grad_norm": 635.9511108398438, + "learning_rate": 2.934119782214156e-05, + "loss": 40.4499, + "step": 131 + }, + { + "epoch": 0.4767494356659142, + "grad_norm": 603.8322143554688, + "learning_rate": 2.9335753176043556e-05, + "loss": 40.9184, + "step": 132 + }, + { + "epoch": 0.4803611738148984, + "grad_norm": 435.4403381347656, + "learning_rate": 2.933030852994555e-05, + "loss": 41.3631, + "step": 133 + }, + { + "epoch": 0.48397291196388265, + "grad_norm": 445.1494140625, + "learning_rate": 2.932486388384755e-05, + "loss": 41.1298, + "step": 134 + }, + { + "epoch": 0.48758465011286684, + "grad_norm": 627.1956176757812, + "learning_rate": 2.931941923774955e-05, + "loss": 42.6427, + "step": 135 + }, + { + "epoch": 0.491196388261851, + "grad_norm": 364.08441162109375, + "learning_rate": 2.9313974591651544e-05, + "loss": 40.8941, + "step": 136 + }, + { + "epoch": 0.4948081264108352, + "grad_norm": 521.076904296875, + "learning_rate": 2.930852994555354e-05, + "loss": 43.2699, + "step": 137 + }, + { + "epoch": 0.4984198645598194, + "grad_norm": 480.8160095214844, + "learning_rate": 2.9303085299455535e-05, + "loss": 42.9513, + "step": 138 + }, + { + "epoch": 0.5020316027088036, + "grad_norm": 484.83172607421875, + "learning_rate": 2.9297640653357534e-05, + "loss": 42.4648, + "step": 139 + }, + { + "epoch": 0.5056433408577878, + "grad_norm": 478.5503845214844, + "learning_rate": 2.929219600725953e-05, + "loss": 43.4351, + "step": 140 + }, + { + "epoch": 0.5056433408577878, + "eval_loss": 0.7245867252349854, + "eval_runtime": 3.1305, + "eval_samples_per_second": 57.178, + "eval_steps_per_second": 57.178, + "step": 140 + }, + { + "epoch": 0.509255079006772, + "grad_norm": 501.84991455078125, + "learning_rate": 2.9286751361161525e-05, + "loss": 42.7249, + "step": 141 + }, + { + "epoch": 0.5128668171557562, + "grad_norm": 496.357177734375, + "learning_rate": 2.928130671506352e-05, + "loss": 42.7323, + "step": 142 + }, + { + "epoch": 0.5164785553047404, + "grad_norm": 476.9631042480469, + "learning_rate": 2.9275862068965515e-05, + "loss": 44.2251, + "step": 143 + }, + { + "epoch": 0.5200902934537246, + "grad_norm": 435.324951171875, + "learning_rate": 2.9270417422867514e-05, + "loss": 43.2753, + "step": 144 + }, + { + "epoch": 0.5237020316027088, + "grad_norm": 486.4795227050781, + "learning_rate": 2.9264972776769513e-05, + "loss": 43.9547, + "step": 145 + }, + { + "epoch": 0.527313769751693, + "grad_norm": 573.4031372070312, + "learning_rate": 2.925952813067151e-05, + "loss": 32.5569, + "step": 146 + }, + { + "epoch": 0.5309255079006772, + "grad_norm": 429.2251892089844, + "learning_rate": 2.9254083484573504e-05, + "loss": 29.0521, + "step": 147 + }, + { + "epoch": 0.5345372460496614, + "grad_norm": 478.80426025390625, + "learning_rate": 2.92486388384755e-05, + "loss": 28.9163, + "step": 148 + }, + { + "epoch": 0.5381489841986457, + "grad_norm": 475.7033996582031, + "learning_rate": 2.9243194192377495e-05, + "loss": 29.2594, + "step": 149 + }, + { + "epoch": 0.5417607223476298, + "grad_norm": 530.3062133789062, + "learning_rate": 2.9237749546279493e-05, + "loss": 29.8788, + "step": 150 + }, + { + "epoch": 0.5417607223476298, + "eval_loss": 0.8220540285110474, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.112, + "eval_steps_per_second": 57.112, + "step": 150 + }, + { + "epoch": 0.545372460496614, + "grad_norm": 1374.2142333984375, + "learning_rate": 2.923230490018149e-05, + "loss": 62.5697, + "step": 151 + }, + { + "epoch": 0.5489841986455982, + "grad_norm": 1227.5701904296875, + "learning_rate": 2.9226860254083484e-05, + "loss": 61.1637, + "step": 152 + }, + { + "epoch": 0.5525959367945824, + "grad_norm": 980.4124145507812, + "learning_rate": 2.9221415607985483e-05, + "loss": 57.9838, + "step": 153 + }, + { + "epoch": 0.5562076749435666, + "grad_norm": 792.8090209960938, + "learning_rate": 2.9215970961887478e-05, + "loss": 56.3787, + "step": 154 + }, + { + "epoch": 0.5598194130925508, + "grad_norm": 602.3422241210938, + "learning_rate": 2.9210526315789474e-05, + "loss": 52.8103, + "step": 155 + }, + { + "epoch": 0.563431151241535, + "grad_norm": 493.4346008300781, + "learning_rate": 2.9205081669691472e-05, + "loss": 51.002, + "step": 156 + }, + { + "epoch": 0.5670428893905192, + "grad_norm": 619.7504272460938, + "learning_rate": 2.9199637023593468e-05, + "loss": 50.0153, + "step": 157 + }, + { + "epoch": 0.5706546275395034, + "grad_norm": 610.8827514648438, + "learning_rate": 2.9194192377495463e-05, + "loss": 52.3504, + "step": 158 + }, + { + "epoch": 0.5742663656884875, + "grad_norm": 670.8658447265625, + "learning_rate": 2.918874773139746e-05, + "loss": 52.9307, + "step": 159 + }, + { + "epoch": 0.5778781038374717, + "grad_norm": 552.539306640625, + "learning_rate": 2.9183303085299454e-05, + "loss": 49.7189, + "step": 160 + }, + { + "epoch": 0.5778781038374717, + "eval_loss": 0.7438566088676453, + "eval_runtime": 3.134, + "eval_samples_per_second": 57.116, + "eval_steps_per_second": 57.116, + "step": 160 + }, + { + "epoch": 0.581489841986456, + "grad_norm": 605.2847900390625, + "learning_rate": 2.9177858439201453e-05, + "loss": 50.6365, + "step": 161 + }, + { + "epoch": 0.5851015801354402, + "grad_norm": 460.163818359375, + "learning_rate": 2.9172413793103448e-05, + "loss": 45.5784, + "step": 162 + }, + { + "epoch": 0.5887133182844244, + "grad_norm": 630.098876953125, + "learning_rate": 2.9166969147005447e-05, + "loss": 45.6859, + "step": 163 + }, + { + "epoch": 0.5923250564334086, + "grad_norm": 532.3728637695312, + "learning_rate": 2.9161524500907442e-05, + "loss": 45.3804, + "step": 164 + }, + { + "epoch": 0.5959367945823928, + "grad_norm": 510.09234619140625, + "learning_rate": 2.9156079854809438e-05, + "loss": 44.6911, + "step": 165 + }, + { + "epoch": 0.599548532731377, + "grad_norm": 465.53887939453125, + "learning_rate": 2.9150635208711436e-05, + "loss": 45.7436, + "step": 166 + }, + { + "epoch": 0.6031602708803612, + "grad_norm": 413.5904235839844, + "learning_rate": 2.9145190562613432e-05, + "loss": 45.3019, + "step": 167 + }, + { + "epoch": 0.6067720090293454, + "grad_norm": 514.5824584960938, + "learning_rate": 2.9139745916515427e-05, + "loss": 46.0631, + "step": 168 + }, + { + "epoch": 0.6103837471783295, + "grad_norm": 402.7557373046875, + "learning_rate": 2.9134301270417423e-05, + "loss": 46.032, + "step": 169 + }, + { + "epoch": 0.6139954853273137, + "grad_norm": 434.61138916015625, + "learning_rate": 2.9128856624319418e-05, + "loss": 46.1674, + "step": 170 + }, + { + "epoch": 0.6139954853273137, + "eval_loss": 0.7043496966362, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.162, + "eval_steps_per_second": 57.162, + "step": 170 + }, + { + "epoch": 0.6176072234762979, + "grad_norm": 368.5428771972656, + "learning_rate": 2.9123411978221413e-05, + "loss": 47.4448, + "step": 171 + }, + { + "epoch": 0.6212189616252821, + "grad_norm": 382.7486267089844, + "learning_rate": 2.9117967332123415e-05, + "loss": 46.0437, + "step": 172 + }, + { + "epoch": 0.6248306997742664, + "grad_norm": 373.2402038574219, + "learning_rate": 2.911252268602541e-05, + "loss": 47.2806, + "step": 173 + }, + { + "epoch": 0.6284424379232506, + "grad_norm": 404.00799560546875, + "learning_rate": 2.9107078039927406e-05, + "loss": 46.9239, + "step": 174 + }, + { + "epoch": 0.6320541760722348, + "grad_norm": 421.1421203613281, + "learning_rate": 2.91016333938294e-05, + "loss": 47.2773, + "step": 175 + }, + { + "epoch": 0.635665914221219, + "grad_norm": 384.21380615234375, + "learning_rate": 2.9096188747731397e-05, + "loss": 47.7277, + "step": 176 + }, + { + "epoch": 0.6392776523702032, + "grad_norm": 401.65625, + "learning_rate": 2.9090744101633396e-05, + "loss": 47.4115, + "step": 177 + }, + { + "epoch": 0.6428893905191874, + "grad_norm": 389.7224426269531, + "learning_rate": 2.908529945553539e-05, + "loss": 46.9206, + "step": 178 + }, + { + "epoch": 0.6465011286681716, + "grad_norm": 370.7626037597656, + "learning_rate": 2.9079854809437387e-05, + "loss": 43.074, + "step": 179 + }, + { + "epoch": 0.6501128668171557, + "grad_norm": 397.579833984375, + "learning_rate": 2.9074410163339382e-05, + "loss": 40.7953, + "step": 180 + }, + { + "epoch": 0.6501128668171557, + "eval_loss": 0.7069951295852661, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.109, + "eval_steps_per_second": 57.109, + "step": 180 + }, + { + "epoch": 0.6537246049661399, + "grad_norm": 355.5390625, + "learning_rate": 2.906896551724138e-05, + "loss": 39.9822, + "step": 181 + }, + { + "epoch": 0.6573363431151241, + "grad_norm": 385.6095275878906, + "learning_rate": 2.9063520871143376e-05, + "loss": 38.2107, + "step": 182 + }, + { + "epoch": 0.6609480812641083, + "grad_norm": 469.42449951171875, + "learning_rate": 2.9058076225045375e-05, + "loss": 40.9879, + "step": 183 + }, + { + "epoch": 0.6645598194130925, + "grad_norm": 374.644287109375, + "learning_rate": 2.905263157894737e-05, + "loss": 39.9646, + "step": 184 + }, + { + "epoch": 0.6681715575620768, + "grad_norm": 423.72412109375, + "learning_rate": 2.9047186932849366e-05, + "loss": 40.37, + "step": 185 + }, + { + "epoch": 0.671783295711061, + "grad_norm": 374.5202331542969, + "learning_rate": 2.904174228675136e-05, + "loss": 40.593, + "step": 186 + }, + { + "epoch": 0.6753950338600452, + "grad_norm": 352.500244140625, + "learning_rate": 2.9036297640653356e-05, + "loss": 40.4483, + "step": 187 + }, + { + "epoch": 0.6790067720090294, + "grad_norm": 368.6827392578125, + "learning_rate": 2.9030852994555355e-05, + "loss": 41.0123, + "step": 188 + }, + { + "epoch": 0.6826185101580136, + "grad_norm": 339.8343200683594, + "learning_rate": 2.902540834845735e-05, + "loss": 41.0098, + "step": 189 + }, + { + "epoch": 0.6862302483069977, + "grad_norm": 362.53424072265625, + "learning_rate": 2.901996370235935e-05, + "loss": 42.0471, + "step": 190 + }, + { + "epoch": 0.6862302483069977, + "eval_loss": 0.7149370908737183, + "eval_runtime": 3.1346, + "eval_samples_per_second": 57.104, + "eval_steps_per_second": 57.104, + "step": 190 + }, + { + "epoch": 0.6898419864559819, + "grad_norm": 394.1274719238281, + "learning_rate": 2.9014519056261345e-05, + "loss": 43.0053, + "step": 191 + }, + { + "epoch": 0.6934537246049661, + "grad_norm": 370.6410217285156, + "learning_rate": 2.900907441016334e-05, + "loss": 42.6179, + "step": 192 + }, + { + "epoch": 0.6970654627539503, + "grad_norm": 396.1412048339844, + "learning_rate": 2.9003629764065335e-05, + "loss": 42.4657, + "step": 193 + }, + { + "epoch": 0.7006772009029345, + "grad_norm": 359.99468994140625, + "learning_rate": 2.8998185117967334e-05, + "loss": 41.6011, + "step": 194 + }, + { + "epoch": 0.7042889390519187, + "grad_norm": 436.6610107421875, + "learning_rate": 2.899274047186933e-05, + "loss": 42.8562, + "step": 195 + }, + { + "epoch": 0.7079006772009029, + "grad_norm": 472.45355224609375, + "learning_rate": 2.8987295825771325e-05, + "loss": 35.0799, + "step": 196 + }, + { + "epoch": 0.7115124153498872, + "grad_norm": 441.8983154296875, + "learning_rate": 2.898185117967332e-05, + "loss": 29.5268, + "step": 197 + }, + { + "epoch": 0.7151241534988714, + "grad_norm": 376.2563171386719, + "learning_rate": 2.8976406533575316e-05, + "loss": 27.1006, + "step": 198 + }, + { + "epoch": 0.7187358916478556, + "grad_norm": 345.8896789550781, + "learning_rate": 2.8970961887477318e-05, + "loss": 27.4286, + "step": 199 + }, + { + "epoch": 0.7223476297968398, + "grad_norm": 408.644775390625, + "learning_rate": 2.8965517241379313e-05, + "loss": 27.3932, + "step": 200 + }, + { + "epoch": 0.7223476297968398, + "eval_loss": 0.7911182641983032, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 200 + }, + { + "epoch": 0.7259593679458239, + "grad_norm": 1156.6982421875, + "learning_rate": 2.896007259528131e-05, + "loss": 58.3407, + "step": 201 + }, + { + "epoch": 0.7295711060948081, + "grad_norm": 1127.2872314453125, + "learning_rate": 2.8954627949183304e-05, + "loss": 58.1773, + "step": 202 + }, + { + "epoch": 0.7331828442437923, + "grad_norm": 974.721923828125, + "learning_rate": 2.89491833030853e-05, + "loss": 57.3066, + "step": 203 + }, + { + "epoch": 0.7367945823927765, + "grad_norm": 724.0964965820312, + "learning_rate": 2.8943738656987295e-05, + "loss": 54.5647, + "step": 204 + }, + { + "epoch": 0.7404063205417607, + "grad_norm": 577.144287109375, + "learning_rate": 2.8938294010889294e-05, + "loss": 51.5741, + "step": 205 + }, + { + "epoch": 0.7440180586907449, + "grad_norm": 406.2142028808594, + "learning_rate": 2.893284936479129e-05, + "loss": 49.4595, + "step": 206 + }, + { + "epoch": 0.7476297968397291, + "grad_norm": 537.4603271484375, + "learning_rate": 2.8927404718693284e-05, + "loss": 50.7602, + "step": 207 + }, + { + "epoch": 0.7512415349887133, + "grad_norm": 696.2557373046875, + "learning_rate": 2.892196007259528e-05, + "loss": 50.6034, + "step": 208 + }, + { + "epoch": 0.7548532731376976, + "grad_norm": 644.7799682617188, + "learning_rate": 2.891651542649728e-05, + "loss": 50.5617, + "step": 209 + }, + { + "epoch": 0.7584650112866818, + "grad_norm": 443.0915832519531, + "learning_rate": 2.8911070780399277e-05, + "loss": 48.4847, + "step": 210 + }, + { + "epoch": 0.7584650112866818, + "eval_loss": 0.7149282097816467, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 210 + }, + { + "epoch": 0.762076749435666, + "grad_norm": 359.4809875488281, + "learning_rate": 2.8905626134301273e-05, + "loss": 46.2606, + "step": 211 + }, + { + "epoch": 0.7656884875846501, + "grad_norm": 422.4323425292969, + "learning_rate": 2.8900181488203268e-05, + "loss": 45.7595, + "step": 212 + }, + { + "epoch": 0.7693002257336343, + "grad_norm": 374.7406311035156, + "learning_rate": 2.8894736842105263e-05, + "loss": 45.5474, + "step": 213 + }, + { + "epoch": 0.7729119638826185, + "grad_norm": 360.0633544921875, + "learning_rate": 2.888929219600726e-05, + "loss": 43.0967, + "step": 214 + }, + { + "epoch": 0.7765237020316027, + "grad_norm": 401.3516540527344, + "learning_rate": 2.8883847549909254e-05, + "loss": 44.7585, + "step": 215 + }, + { + "epoch": 0.7801354401805869, + "grad_norm": 461.3826904296875, + "learning_rate": 2.8878402903811253e-05, + "loss": 44.1134, + "step": 216 + }, + { + "epoch": 0.7837471783295711, + "grad_norm": 388.8529052734375, + "learning_rate": 2.887295825771325e-05, + "loss": 44.1363, + "step": 217 + }, + { + "epoch": 0.7873589164785553, + "grad_norm": 365.8173828125, + "learning_rate": 2.8867513611615247e-05, + "loss": 45.4802, + "step": 218 + }, + { + "epoch": 0.7909706546275395, + "grad_norm": 362.2828369140625, + "learning_rate": 2.8862068965517243e-05, + "loss": 45.2052, + "step": 219 + }, + { + "epoch": 0.7945823927765236, + "grad_norm": 387.8126220703125, + "learning_rate": 2.8856624319419238e-05, + "loss": 46.6664, + "step": 220 + }, + { + "epoch": 0.7945823927765236, + "eval_loss": 0.7011916637420654, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.11, + "eval_steps_per_second": 57.11, + "step": 220 + }, + { + "epoch": 0.798194130925508, + "grad_norm": 481.8575744628906, + "learning_rate": 2.8851179673321237e-05, + "loss": 45.9061, + "step": 221 + }, + { + "epoch": 0.8018058690744921, + "grad_norm": 403.699462890625, + "learning_rate": 2.8845735027223232e-05, + "loss": 46.1226, + "step": 222 + }, + { + "epoch": 0.8054176072234763, + "grad_norm": 389.87646484375, + "learning_rate": 2.8840290381125227e-05, + "loss": 47.5213, + "step": 223 + }, + { + "epoch": 0.8090293453724605, + "grad_norm": 351.58551025390625, + "learning_rate": 2.8834845735027223e-05, + "loss": 46.5401, + "step": 224 + }, + { + "epoch": 0.8126410835214447, + "grad_norm": 419.92437744140625, + "learning_rate": 2.8829401088929218e-05, + "loss": 46.655, + "step": 225 + }, + { + "epoch": 0.8162528216704289, + "grad_norm": 369.2700500488281, + "learning_rate": 2.8823956442831214e-05, + "loss": 47.2712, + "step": 226 + }, + { + "epoch": 0.8198645598194131, + "grad_norm": 350.486328125, + "learning_rate": 2.8818511796733216e-05, + "loss": 45.873, + "step": 227 + }, + { + "epoch": 0.8234762979683973, + "grad_norm": 370.6356201171875, + "learning_rate": 2.881306715063521e-05, + "loss": 45.5976, + "step": 228 + }, + { + "epoch": 0.8270880361173815, + "grad_norm": 388.7554931640625, + "learning_rate": 2.8807622504537207e-05, + "loss": 45.4359, + "step": 229 + }, + { + "epoch": 0.8306997742663657, + "grad_norm": 356.65447998046875, + "learning_rate": 2.8802177858439202e-05, + "loss": 41.5546, + "step": 230 + }, + { + "epoch": 0.8306997742663657, + "eval_loss": 0.6976248621940613, + "eval_runtime": 3.1394, + "eval_samples_per_second": 57.018, + "eval_steps_per_second": 57.018, + "step": 230 + }, + { + "epoch": 0.8343115124153498, + "grad_norm": 484.9076232910156, + "learning_rate": 2.8796733212341197e-05, + "loss": 40.3896, + "step": 231 + }, + { + "epoch": 0.837923250564334, + "grad_norm": 426.18902587890625, + "learning_rate": 2.8791288566243196e-05, + "loss": 38.1999, + "step": 232 + }, + { + "epoch": 0.8415349887133183, + "grad_norm": 387.5289001464844, + "learning_rate": 2.878584392014519e-05, + "loss": 38.8128, + "step": 233 + }, + { + "epoch": 0.8451467268623025, + "grad_norm": 491.71331787109375, + "learning_rate": 2.8780399274047187e-05, + "loss": 39.1003, + "step": 234 + }, + { + "epoch": 0.8487584650112867, + "grad_norm": 376.87249755859375, + "learning_rate": 2.8774954627949182e-05, + "loss": 40.7458, + "step": 235 + }, + { + "epoch": 0.8523702031602709, + "grad_norm": 459.1217041015625, + "learning_rate": 2.876950998185118e-05, + "loss": 41.9222, + "step": 236 + }, + { + "epoch": 0.8559819413092551, + "grad_norm": 445.1222229003906, + "learning_rate": 2.8764065335753176e-05, + "loss": 41.0784, + "step": 237 + }, + { + "epoch": 0.8595936794582393, + "grad_norm": 375.32843017578125, + "learning_rate": 2.8758620689655175e-05, + "loss": 41.5524, + "step": 238 + }, + { + "epoch": 0.8632054176072235, + "grad_norm": 303.4617614746094, + "learning_rate": 2.875317604355717e-05, + "loss": 41.5471, + "step": 239 + }, + { + "epoch": 0.8668171557562077, + "grad_norm": 360.2012634277344, + "learning_rate": 2.8747731397459166e-05, + "loss": 40.503, + "step": 240 + }, + { + "epoch": 0.8668171557562077, + "eval_loss": 0.695322573184967, + "eval_runtime": 3.1341, + "eval_samples_per_second": 57.113, + "eval_steps_per_second": 57.113, + "step": 240 + }, + { + "epoch": 0.8704288939051918, + "grad_norm": 384.7886047363281, + "learning_rate": 2.874228675136116e-05, + "loss": 41.8679, + "step": 241 + }, + { + "epoch": 0.874040632054176, + "grad_norm": 344.9561767578125, + "learning_rate": 2.8736842105263157e-05, + "loss": 42.4417, + "step": 242 + }, + { + "epoch": 0.8776523702031602, + "grad_norm": 356.1025695800781, + "learning_rate": 2.8731397459165155e-05, + "loss": 42.0715, + "step": 243 + }, + { + "epoch": 0.8812641083521444, + "grad_norm": 416.7387390136719, + "learning_rate": 2.872595281306715e-05, + "loss": 42.4063, + "step": 244 + }, + { + "epoch": 0.8848758465011287, + "grad_norm": 392.5692138671875, + "learning_rate": 2.872050816696915e-05, + "loss": 41.3914, + "step": 245 + }, + { + "epoch": 0.8884875846501129, + "grad_norm": 405.326416015625, + "learning_rate": 2.8715063520871145e-05, + "loss": 34.0761, + "step": 246 + }, + { + "epoch": 0.8920993227990971, + "grad_norm": 484.799072265625, + "learning_rate": 2.870961887477314e-05, + "loss": 28.2779, + "step": 247 + }, + { + "epoch": 0.8957110609480813, + "grad_norm": 499.2939147949219, + "learning_rate": 2.8704174228675136e-05, + "loss": 27.6529, + "step": 248 + }, + { + "epoch": 0.8993227990970655, + "grad_norm": 381.8467102050781, + "learning_rate": 2.8698729582577135e-05, + "loss": 27.4412, + "step": 249 + }, + { + "epoch": 0.9029345372460497, + "grad_norm": 344.0008850097656, + "learning_rate": 2.869328493647913e-05, + "loss": 28.0706, + "step": 250 + }, + { + "epoch": 0.9029345372460497, + "eval_loss": 0.7842397093772888, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.085, + "eval_steps_per_second": 57.085, + "step": 250 + }, + { + "epoch": 0.9065462753950339, + "grad_norm": 1059.8311767578125, + "learning_rate": 2.8687840290381125e-05, + "loss": 58.7628, + "step": 251 + }, + { + "epoch": 0.910158013544018, + "grad_norm": 1057.7684326171875, + "learning_rate": 2.868239564428312e-05, + "loss": 57.5323, + "step": 252 + }, + { + "epoch": 0.9137697516930022, + "grad_norm": 976.0852661132812, + "learning_rate": 2.8676950998185116e-05, + "loss": 55.8152, + "step": 253 + }, + { + "epoch": 0.9173814898419864, + "grad_norm": 860.575439453125, + "learning_rate": 2.8671506352087115e-05, + "loss": 55.599, + "step": 254 + }, + { + "epoch": 0.9209932279909706, + "grad_norm": 615.1729736328125, + "learning_rate": 2.8666061705989114e-05, + "loss": 52.4687, + "step": 255 + }, + { + "epoch": 0.9246049661399548, + "grad_norm": 489.91754150390625, + "learning_rate": 2.866061705989111e-05, + "loss": 50.4135, + "step": 256 + }, + { + "epoch": 0.9282167042889391, + "grad_norm": 490.24029541015625, + "learning_rate": 2.8655172413793104e-05, + "loss": 48.5034, + "step": 257 + }, + { + "epoch": 0.9318284424379233, + "grad_norm": 396.28326416015625, + "learning_rate": 2.86497277676951e-05, + "loss": 47.2695, + "step": 258 + }, + { + "epoch": 0.9354401805869075, + "grad_norm": 382.5725402832031, + "learning_rate": 2.86442831215971e-05, + "loss": 46.0009, + "step": 259 + }, + { + "epoch": 0.9390519187358917, + "grad_norm": 403.9846496582031, + "learning_rate": 2.8638838475499094e-05, + "loss": 45.5784, + "step": 260 + }, + { + "epoch": 0.9390519187358917, + "eval_loss": 0.7167119979858398, + "eval_runtime": 3.147, + "eval_samples_per_second": 56.88, + "eval_steps_per_second": 56.88, + "step": 260 + }, + { + "epoch": 0.9426636568848759, + "grad_norm": 501.81561279296875, + "learning_rate": 2.863339382940109e-05, + "loss": 46.9294, + "step": 261 + }, + { + "epoch": 0.94627539503386, + "grad_norm": 500.6963806152344, + "learning_rate": 2.8627949183303085e-05, + "loss": 47.5202, + "step": 262 + }, + { + "epoch": 0.9498871331828442, + "grad_norm": 453.0813903808594, + "learning_rate": 2.862250453720508e-05, + "loss": 47.7158, + "step": 263 + }, + { + "epoch": 0.9534988713318284, + "grad_norm": 460.04742431640625, + "learning_rate": 2.861705989110708e-05, + "loss": 48.9962, + "step": 264 + }, + { + "epoch": 0.9571106094808126, + "grad_norm": 426.95196533203125, + "learning_rate": 2.8611615245009078e-05, + "loss": 48.8704, + "step": 265 + }, + { + "epoch": 0.9607223476297968, + "grad_norm": 381.4711608886719, + "learning_rate": 2.8606170598911073e-05, + "loss": 43.871, + "step": 266 + }, + { + "epoch": 0.964334085778781, + "grad_norm": 333.3099060058594, + "learning_rate": 2.860072595281307e-05, + "loss": 38.4646, + "step": 267 + }, + { + "epoch": 0.9679458239277653, + "grad_norm": 325.5362548828125, + "learning_rate": 2.8595281306715064e-05, + "loss": 37.1731, + "step": 268 + }, + { + "epoch": 0.9715575620767495, + "grad_norm": 379.2328796386719, + "learning_rate": 2.858983666061706e-05, + "loss": 39.6756, + "step": 269 + }, + { + "epoch": 0.9751693002257337, + "grad_norm": 407.74200439453125, + "learning_rate": 2.8584392014519058e-05, + "loss": 41.435, + "step": 270 + }, + { + "epoch": 0.9751693002257337, + "eval_loss": 0.6935378909111023, + "eval_runtime": 3.1372, + "eval_samples_per_second": 57.057, + "eval_steps_per_second": 57.057, + "step": 270 + }, + { + "epoch": 0.9787810383747179, + "grad_norm": 432.80267333984375, + "learning_rate": 2.8578947368421053e-05, + "loss": 41.3764, + "step": 271 + }, + { + "epoch": 0.982392776523702, + "grad_norm": 386.5149841308594, + "learning_rate": 2.857350272232305e-05, + "loss": 39.6562, + "step": 272 + }, + { + "epoch": 0.9860045146726862, + "grad_norm": 394.14471435546875, + "learning_rate": 2.8568058076225047e-05, + "loss": 42.5142, + "step": 273 + }, + { + "epoch": 0.9896162528216704, + "grad_norm": 389.7673645019531, + "learning_rate": 2.8562613430127043e-05, + "loss": 41.819, + "step": 274 + }, + { + "epoch": 0.9932279909706546, + "grad_norm": 473.9212951660156, + "learning_rate": 2.8557168784029038e-05, + "loss": 34.3601, + "step": 275 + }, + { + "epoch": 0.9968397291196388, + "grad_norm": 422.0166320800781, + "learning_rate": 2.8551724137931037e-05, + "loss": 27.1479, + "step": 276 + }, + { + "epoch": 1.0, + "grad_norm": 287.4736633300781, + "learning_rate": 2.8546279491833032e-05, + "loss": 23.7312, + "step": 277 + }, + { + "epoch": 1.0036117381489842, + "grad_norm": 877.3681030273438, + "learning_rate": 2.8540834845735028e-05, + "loss": 54.4935, + "step": 278 + }, + { + "epoch": 1.0072234762979684, + "grad_norm": 739.6668090820312, + "learning_rate": 2.8535390199637023e-05, + "loss": 52.8877, + "step": 279 + }, + { + "epoch": 1.0108352144469526, + "grad_norm": 718.5248413085938, + "learning_rate": 2.852994555353902e-05, + "loss": 52.3691, + "step": 280 + }, + { + "epoch": 1.0108352144469526, + "eval_loss": 0.7196069359779358, + "eval_runtime": 3.139, + "eval_samples_per_second": 57.025, + "eval_steps_per_second": 57.025, + "step": 280 + }, + { + "epoch": 1.0144469525959368, + "grad_norm": 532.3770141601562, + "learning_rate": 2.8524500907441017e-05, + "loss": 49.2538, + "step": 281 + }, + { + "epoch": 1.018058690744921, + "grad_norm": 431.7366027832031, + "learning_rate": 2.8519056261343016e-05, + "loss": 48.7469, + "step": 282 + }, + { + "epoch": 1.0216704288939051, + "grad_norm": 338.91424560546875, + "learning_rate": 2.851361161524501e-05, + "loss": 47.96, + "step": 283 + }, + { + "epoch": 1.0252821670428893, + "grad_norm": 448.8798828125, + "learning_rate": 2.8508166969147007e-05, + "loss": 48.9088, + "step": 284 + }, + { + "epoch": 1.0288939051918735, + "grad_norm": 395.4872131347656, + "learning_rate": 2.8502722323049002e-05, + "loss": 49.1375, + "step": 285 + }, + { + "epoch": 1.0325056433408577, + "grad_norm": 428.61285400390625, + "learning_rate": 2.8497277676950998e-05, + "loss": 49.393, + "step": 286 + }, + { + "epoch": 1.036117381489842, + "grad_norm": 380.1004943847656, + "learning_rate": 2.8491833030852996e-05, + "loss": 49.0513, + "step": 287 + }, + { + "epoch": 1.039729119638826, + "grad_norm": 318.0881042480469, + "learning_rate": 2.8486388384754992e-05, + "loss": 47.6806, + "step": 288 + }, + { + "epoch": 1.0433408577878103, + "grad_norm": 331.2910461425781, + "learning_rate": 2.8480943738656987e-05, + "loss": 45.9821, + "step": 289 + }, + { + "epoch": 1.0469525959367947, + "grad_norm": 280.7160339355469, + "learning_rate": 2.8475499092558982e-05, + "loss": 43.9498, + "step": 290 + }, + { + "epoch": 1.0469525959367947, + "eval_loss": 0.682730495929718, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 290 + }, + { + "epoch": 1.0505643340857789, + "grad_norm": 246.1832733154297, + "learning_rate": 2.8470054446460978e-05, + "loss": 42.5624, + "step": 291 + }, + { + "epoch": 1.054176072234763, + "grad_norm": 262.2304992675781, + "learning_rate": 2.846460980036298e-05, + "loss": 43.2049, + "step": 292 + }, + { + "epoch": 1.0577878103837473, + "grad_norm": 290.2947082519531, + "learning_rate": 2.8459165154264975e-05, + "loss": 43.5165, + "step": 293 + }, + { + "epoch": 1.0613995485327314, + "grad_norm": 269.8375244140625, + "learning_rate": 2.845372050816697e-05, + "loss": 43.5621, + "step": 294 + }, + { + "epoch": 1.0650112866817156, + "grad_norm": 275.5233459472656, + "learning_rate": 2.8448275862068966e-05, + "loss": 42.9337, + "step": 295 + }, + { + "epoch": 1.0686230248306998, + "grad_norm": 275.6507873535156, + "learning_rate": 2.844283121597096e-05, + "loss": 45.051, + "step": 296 + }, + { + "epoch": 1.072234762979684, + "grad_norm": 340.93536376953125, + "learning_rate": 2.8437386569872957e-05, + "loss": 44.6103, + "step": 297 + }, + { + "epoch": 1.0758465011286682, + "grad_norm": 286.0844421386719, + "learning_rate": 2.8431941923774956e-05, + "loss": 44.5428, + "step": 298 + }, + { + "epoch": 1.0794582392776524, + "grad_norm": 316.6739501953125, + "learning_rate": 2.842649727767695e-05, + "loss": 45.631, + "step": 299 + }, + { + "epoch": 1.0830699774266366, + "grad_norm": 256.1273193359375, + "learning_rate": 2.8421052631578946e-05, + "loss": 45.0464, + "step": 300 + }, + { + "epoch": 1.0830699774266366, + "eval_loss": 0.6778246760368347, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.099, + "eval_steps_per_second": 57.099, + "step": 300 + }, + { + "epoch": 1.0866817155756208, + "grad_norm": 281.78082275390625, + "learning_rate": 2.8415607985480945e-05, + "loss": 45.8102, + "step": 301 + }, + { + "epoch": 1.090293453724605, + "grad_norm": 297.08770751953125, + "learning_rate": 2.841016333938294e-05, + "loss": 45.3131, + "step": 302 + }, + { + "epoch": 1.0939051918735891, + "grad_norm": 388.77972412109375, + "learning_rate": 2.840471869328494e-05, + "loss": 44.9113, + "step": 303 + }, + { + "epoch": 1.0975169300225733, + "grad_norm": 301.92913818359375, + "learning_rate": 2.8399274047186935e-05, + "loss": 45.9125, + "step": 304 + }, + { + "epoch": 1.1011286681715575, + "grad_norm": 387.6468505859375, + "learning_rate": 2.839382940108893e-05, + "loss": 45.7297, + "step": 305 + }, + { + "epoch": 1.1047404063205417, + "grad_norm": 315.0013427734375, + "learning_rate": 2.8388384754990926e-05, + "loss": 45.2253, + "step": 306 + }, + { + "epoch": 1.108352144469526, + "grad_norm": 341.985595703125, + "learning_rate": 2.838294010889292e-05, + "loss": 42.6807, + "step": 307 + }, + { + "epoch": 1.11196388261851, + "grad_norm": 390.09674072265625, + "learning_rate": 2.8377495462794916e-05, + "loss": 39.3621, + "step": 308 + }, + { + "epoch": 1.1155756207674943, + "grad_norm": 391.62640380859375, + "learning_rate": 2.8372050816696915e-05, + "loss": 37.6168, + "step": 309 + }, + { + "epoch": 1.1191873589164785, + "grad_norm": 353.9164123535156, + "learning_rate": 2.8366606170598914e-05, + "loss": 38.7192, + "step": 310 + }, + { + "epoch": 1.1191873589164785, + "eval_loss": 0.6953558325767517, + "eval_runtime": 3.1291, + "eval_samples_per_second": 57.205, + "eval_steps_per_second": 57.205, + "step": 310 + }, + { + "epoch": 1.1227990970654627, + "grad_norm": 302.96240234375, + "learning_rate": 2.836116152450091e-05, + "loss": 39.5022, + "step": 311 + }, + { + "epoch": 1.1264108352144468, + "grad_norm": 400.8553466796875, + "learning_rate": 2.8355716878402905e-05, + "loss": 39.6587, + "step": 312 + }, + { + "epoch": 1.1300225733634313, + "grad_norm": 345.9519348144531, + "learning_rate": 2.83502722323049e-05, + "loss": 39.8058, + "step": 313 + }, + { + "epoch": 1.1336343115124152, + "grad_norm": 337.1177978515625, + "learning_rate": 2.83448275862069e-05, + "loss": 39.951, + "step": 314 + }, + { + "epoch": 1.1372460496613996, + "grad_norm": 301.2976989746094, + "learning_rate": 2.8339382940108894e-05, + "loss": 39.309, + "step": 315 + }, + { + "epoch": 1.1408577878103838, + "grad_norm": 406.03094482421875, + "learning_rate": 2.833393829401089e-05, + "loss": 40.6924, + "step": 316 + }, + { + "epoch": 1.144469525959368, + "grad_norm": 390.6329345703125, + "learning_rate": 2.8328493647912885e-05, + "loss": 41.3554, + "step": 317 + }, + { + "epoch": 1.1480812641083522, + "grad_norm": 321.64508056640625, + "learning_rate": 2.832304900181488e-05, + "loss": 41.1766, + "step": 318 + }, + { + "epoch": 1.1516930022573364, + "grad_norm": 283.5152282714844, + "learning_rate": 2.831760435571688e-05, + "loss": 40.1808, + "step": 319 + }, + { + "epoch": 1.1553047404063206, + "grad_norm": 348.6308288574219, + "learning_rate": 2.8312159709618878e-05, + "loss": 42.0895, + "step": 320 + }, + { + "epoch": 1.1553047404063206, + "eval_loss": 0.69289630651474, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 320 + }, + { + "epoch": 1.1589164785553048, + "grad_norm": 316.7882995605469, + "learning_rate": 2.8306715063520873e-05, + "loss": 41.5536, + "step": 321 + }, + { + "epoch": 1.162528216704289, + "grad_norm": 373.5389099121094, + "learning_rate": 2.830127041742287e-05, + "loss": 41.4083, + "step": 322 + }, + { + "epoch": 1.1661399548532732, + "grad_norm": 382.8615417480469, + "learning_rate": 2.8295825771324864e-05, + "loss": 39.9028, + "step": 323 + }, + { + "epoch": 1.1697516930022573, + "grad_norm": 327.3189392089844, + "learning_rate": 2.829038112522686e-05, + "loss": 28.8617, + "step": 324 + }, + { + "epoch": 1.1733634311512415, + "grad_norm": 307.2225036621094, + "learning_rate": 2.8284936479128858e-05, + "loss": 27.1866, + "step": 325 + }, + { + "epoch": 1.1769751693002257, + "grad_norm": 257.647705078125, + "learning_rate": 2.8279491833030854e-05, + "loss": 27.7946, + "step": 326 + }, + { + "epoch": 1.18058690744921, + "grad_norm": 286.5907897949219, + "learning_rate": 2.827404718693285e-05, + "loss": 27.1481, + "step": 327 + }, + { + "epoch": 1.184198645598194, + "grad_norm": 914.318603515625, + "learning_rate": 2.8268602540834848e-05, + "loss": 55.8025, + "step": 328 + }, + { + "epoch": 1.1878103837471783, + "grad_norm": 858.4988403320312, + "learning_rate": 2.8263157894736843e-05, + "loss": 56.1987, + "step": 329 + }, + { + "epoch": 1.1914221218961625, + "grad_norm": 800.506103515625, + "learning_rate": 2.825771324863884e-05, + "loss": 54.3495, + "step": 330 + }, + { + "epoch": 1.1914221218961625, + "eval_loss": 0.7448948621749878, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.132, + "eval_steps_per_second": 57.132, + "step": 330 + }, + { + "epoch": 1.1950338600451467, + "grad_norm": 692.0379028320312, + "learning_rate": 2.8252268602540837e-05, + "loss": 53.5803, + "step": 331 + }, + { + "epoch": 1.1986455981941309, + "grad_norm": 527.4228515625, + "learning_rate": 2.8246823956442833e-05, + "loss": 50.6743, + "step": 332 + }, + { + "epoch": 1.202257336343115, + "grad_norm": 460.74169921875, + "learning_rate": 2.8241379310344828e-05, + "loss": 49.3978, + "step": 333 + }, + { + "epoch": 1.2058690744920992, + "grad_norm": 388.31201171875, + "learning_rate": 2.8235934664246823e-05, + "loss": 49.7682, + "step": 334 + }, + { + "epoch": 1.2094808126410834, + "grad_norm": 414.94775390625, + "learning_rate": 2.823049001814882e-05, + "loss": 48.4647, + "step": 335 + }, + { + "epoch": 1.2130925507900678, + "grad_norm": 440.1581115722656, + "learning_rate": 2.8225045372050818e-05, + "loss": 48.9792, + "step": 336 + }, + { + "epoch": 1.2167042889390518, + "grad_norm": 377.634033203125, + "learning_rate": 2.8219600725952813e-05, + "loss": 48.0859, + "step": 337 + }, + { + "epoch": 1.2203160270880362, + "grad_norm": 286.99462890625, + "learning_rate": 2.8214156079854812e-05, + "loss": 46.2391, + "step": 338 + }, + { + "epoch": 1.2239277652370204, + "grad_norm": 353.834716796875, + "learning_rate": 2.8208711433756807e-05, + "loss": 45.4826, + "step": 339 + }, + { + "epoch": 1.2275395033860046, + "grad_norm": 311.1981506347656, + "learning_rate": 2.8203266787658802e-05, + "loss": 43.7182, + "step": 340 + }, + { + "epoch": 1.2275395033860046, + "eval_loss": 0.6925392150878906, + "eval_runtime": 3.1338, + "eval_samples_per_second": 57.119, + "eval_steps_per_second": 57.119, + "step": 340 + }, + { + "epoch": 1.2311512415349888, + "grad_norm": 343.8255920410156, + "learning_rate": 2.8197822141560798e-05, + "loss": 45.2841, + "step": 341 + }, + { + "epoch": 1.234762979683973, + "grad_norm": 275.9765930175781, + "learning_rate": 2.8192377495462797e-05, + "loss": 43.4088, + "step": 342 + }, + { + "epoch": 1.2383747178329572, + "grad_norm": 228.50440979003906, + "learning_rate": 2.8186932849364792e-05, + "loss": 42.6711, + "step": 343 + }, + { + "epoch": 1.2419864559819414, + "grad_norm": 253.25831604003906, + "learning_rate": 2.8181488203266787e-05, + "loss": 43.0506, + "step": 344 + }, + { + "epoch": 1.2455981941309255, + "grad_norm": 243.9517059326172, + "learning_rate": 2.8176043557168783e-05, + "loss": 44.455, + "step": 345 + }, + { + "epoch": 1.2492099322799097, + "grad_norm": 245.95286560058594, + "learning_rate": 2.8170598911070778e-05, + "loss": 45.154, + "step": 346 + }, + { + "epoch": 1.252821670428894, + "grad_norm": 245.6503143310547, + "learning_rate": 2.816515426497278e-05, + "loss": 44.0179, + "step": 347 + }, + { + "epoch": 1.256433408577878, + "grad_norm": 290.8607177734375, + "learning_rate": 2.8159709618874776e-05, + "loss": 45.7594, + "step": 348 + }, + { + "epoch": 1.2600451467268623, + "grad_norm": 259.909912109375, + "learning_rate": 2.815426497277677e-05, + "loss": 44.4864, + "step": 349 + }, + { + "epoch": 1.2636568848758465, + "grad_norm": 284.4267272949219, + "learning_rate": 2.8148820326678766e-05, + "loss": 47.1445, + "step": 350 + }, + { + "epoch": 1.2636568848758465, + "eval_loss": 0.6740585565567017, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.093, + "eval_steps_per_second": 57.093, + "step": 350 + }, + { + "epoch": 1.2672686230248307, + "grad_norm": 362.87164306640625, + "learning_rate": 2.8143375680580762e-05, + "loss": 46.3238, + "step": 351 + }, + { + "epoch": 1.2708803611738149, + "grad_norm": 290.58477783203125, + "learning_rate": 2.813793103448276e-05, + "loss": 46.2261, + "step": 352 + }, + { + "epoch": 1.274492099322799, + "grad_norm": 289.98101806640625, + "learning_rate": 2.8132486388384756e-05, + "loss": 44.5556, + "step": 353 + }, + { + "epoch": 1.2781038374717832, + "grad_norm": 318.88604736328125, + "learning_rate": 2.812704174228675e-05, + "loss": 45.2847, + "step": 354 + }, + { + "epoch": 1.2817155756207674, + "grad_norm": 318.89227294921875, + "learning_rate": 2.8121597096188747e-05, + "loss": 44.1901, + "step": 355 + }, + { + "epoch": 1.2853273137697516, + "grad_norm": 389.559814453125, + "learning_rate": 2.8116152450090746e-05, + "loss": 43.0926, + "step": 356 + }, + { + "epoch": 1.2889390519187358, + "grad_norm": 444.1388854980469, + "learning_rate": 2.811070780399274e-05, + "loss": 40.3614, + "step": 357 + }, + { + "epoch": 1.29255079006772, + "grad_norm": 395.99993896484375, + "learning_rate": 2.810526315789474e-05, + "loss": 38.9221, + "step": 358 + }, + { + "epoch": 1.2961625282167044, + "grad_norm": 346.8638000488281, + "learning_rate": 2.8099818511796735e-05, + "loss": 37.6041, + "step": 359 + }, + { + "epoch": 1.2997742663656884, + "grad_norm": 255.75537109375, + "learning_rate": 2.809437386569873e-05, + "loss": 38.9997, + "step": 360 + }, + { + "epoch": 1.2997742663656884, + "eval_loss": 0.689025342464447, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 360 + }, + { + "epoch": 1.3033860045146728, + "grad_norm": 337.3376770019531, + "learning_rate": 2.8088929219600726e-05, + "loss": 38.4022, + "step": 361 + }, + { + "epoch": 1.3069977426636568, + "grad_norm": 343.35205078125, + "learning_rate": 2.808348457350272e-05, + "loss": 39.1616, + "step": 362 + }, + { + "epoch": 1.3106094808126412, + "grad_norm": 396.1789245605469, + "learning_rate": 2.807803992740472e-05, + "loss": 41.1005, + "step": 363 + }, + { + "epoch": 1.3142212189616254, + "grad_norm": 358.3716735839844, + "learning_rate": 2.8072595281306715e-05, + "loss": 40.1739, + "step": 364 + }, + { + "epoch": 1.3178329571106095, + "grad_norm": 475.8331298828125, + "learning_rate": 2.8067150635208714e-05, + "loss": 41.1481, + "step": 365 + }, + { + "epoch": 1.3214446952595937, + "grad_norm": 322.4574279785156, + "learning_rate": 2.806170598911071e-05, + "loss": 41.1013, + "step": 366 + }, + { + "epoch": 1.325056433408578, + "grad_norm": 386.1807861328125, + "learning_rate": 2.8056261343012705e-05, + "loss": 41.1077, + "step": 367 + }, + { + "epoch": 1.3286681715575621, + "grad_norm": 335.3432312011719, + "learning_rate": 2.80508166969147e-05, + "loss": 40.5549, + "step": 368 + }, + { + "epoch": 1.3322799097065463, + "grad_norm": 344.7771911621094, + "learning_rate": 2.80453720508167e-05, + "loss": 41.4764, + "step": 369 + }, + { + "epoch": 1.3358916478555305, + "grad_norm": 373.671142578125, + "learning_rate": 2.8039927404718694e-05, + "loss": 42.3345, + "step": 370 + }, + { + "epoch": 1.3358916478555305, + "eval_loss": 0.6789068579673767, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 370 + }, + { + "epoch": 1.3395033860045147, + "grad_norm": 374.783203125, + "learning_rate": 2.803448275862069e-05, + "loss": 41.2196, + "step": 371 + }, + { + "epoch": 1.3431151241534989, + "grad_norm": 391.8028564453125, + "learning_rate": 2.8029038112522685e-05, + "loss": 41.4368, + "step": 372 + }, + { + "epoch": 1.346726862302483, + "grad_norm": 364.7682800292969, + "learning_rate": 2.802359346642468e-05, + "loss": 38.4212, + "step": 373 + }, + { + "epoch": 1.3503386004514673, + "grad_norm": 335.779541015625, + "learning_rate": 2.8018148820326683e-05, + "loss": 26.7865, + "step": 374 + }, + { + "epoch": 1.3539503386004514, + "grad_norm": 353.480224609375, + "learning_rate": 2.8012704174228678e-05, + "loss": 25.3621, + "step": 375 + }, + { + "epoch": 1.3575620767494356, + "grad_norm": 246.8798370361328, + "learning_rate": 2.8007259528130674e-05, + "loss": 26.8962, + "step": 376 + }, + { + "epoch": 1.3611738148984198, + "grad_norm": 384.77801513671875, + "learning_rate": 2.800181488203267e-05, + "loss": 27.0153, + "step": 377 + }, + { + "epoch": 1.364785553047404, + "grad_norm": 781.5183715820312, + "learning_rate": 2.7996370235934664e-05, + "loss": 53.2037, + "step": 378 + }, + { + "epoch": 1.3683972911963882, + "grad_norm": 765.4360961914062, + "learning_rate": 2.799092558983666e-05, + "loss": 55.7635, + "step": 379 + }, + { + "epoch": 1.3720090293453724, + "grad_norm": 725.854736328125, + "learning_rate": 2.798548094373866e-05, + "loss": 52.802, + "step": 380 + }, + { + "epoch": 1.3720090293453724, + "eval_loss": 0.7313510179519653, + "eval_runtime": 3.1334, + "eval_samples_per_second": 57.126, + "eval_steps_per_second": 57.126, + "step": 380 + }, + { + "epoch": 1.3756207674943566, + "grad_norm": 564.2916259765625, + "learning_rate": 2.7980036297640654e-05, + "loss": 51.6548, + "step": 381 + }, + { + "epoch": 1.379232505643341, + "grad_norm": 459.6091003417969, + "learning_rate": 2.797459165154265e-05, + "loss": 48.4082, + "step": 382 + }, + { + "epoch": 1.382844243792325, + "grad_norm": 373.1909484863281, + "learning_rate": 2.7969147005444645e-05, + "loss": 48.5173, + "step": 383 + }, + { + "epoch": 1.3864559819413094, + "grad_norm": 371.30169677734375, + "learning_rate": 2.7963702359346643e-05, + "loss": 47.5063, + "step": 384 + }, + { + "epoch": 1.3900677200902933, + "grad_norm": 336.7066345214844, + "learning_rate": 2.7958257713248642e-05, + "loss": 48.371, + "step": 385 + }, + { + "epoch": 1.3936794582392777, + "grad_norm": 338.871826171875, + "learning_rate": 2.7952813067150638e-05, + "loss": 46.209, + "step": 386 + }, + { + "epoch": 1.3972911963882617, + "grad_norm": 404.99749755859375, + "learning_rate": 2.7947368421052633e-05, + "loss": 48.0522, + "step": 387 + }, + { + "epoch": 1.4009029345372461, + "grad_norm": 374.24017333984375, + "learning_rate": 2.7941923774954628e-05, + "loss": 46.1458, + "step": 388 + }, + { + "epoch": 1.4045146726862303, + "grad_norm": 269.91937255859375, + "learning_rate": 2.7936479128856624e-05, + "loss": 44.5361, + "step": 389 + }, + { + "epoch": 1.4081264108352145, + "grad_norm": 340.3489074707031, + "learning_rate": 2.793103448275862e-05, + "loss": 44.2957, + "step": 390 + }, + { + "epoch": 1.4081264108352145, + "eval_loss": 0.6879153251647949, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.079, + "eval_steps_per_second": 57.079, + "step": 390 + }, + { + "epoch": 1.4117381489841987, + "grad_norm": 275.49676513671875, + "learning_rate": 2.7925589836660618e-05, + "loss": 43.126, + "step": 391 + }, + { + "epoch": 1.4153498871331829, + "grad_norm": 241.9796905517578, + "learning_rate": 2.7920145190562613e-05, + "loss": 43.3793, + "step": 392 + }, + { + "epoch": 1.418961625282167, + "grad_norm": 274.9486389160156, + "learning_rate": 2.7914700544464612e-05, + "loss": 42.0434, + "step": 393 + }, + { + "epoch": 1.4225733634311513, + "grad_norm": 259.0799255371094, + "learning_rate": 2.7909255898366607e-05, + "loss": 43.9504, + "step": 394 + }, + { + "epoch": 1.4261851015801355, + "grad_norm": 311.82464599609375, + "learning_rate": 2.7903811252268603e-05, + "loss": 43.505, + "step": 395 + }, + { + "epoch": 1.4297968397291196, + "grad_norm": 301.56243896484375, + "learning_rate": 2.78983666061706e-05, + "loss": 44.5498, + "step": 396 + }, + { + "epoch": 1.4334085778781038, + "grad_norm": 251.64212036132812, + "learning_rate": 2.7892921960072597e-05, + "loss": 44.5173, + "step": 397 + }, + { + "epoch": 1.437020316027088, + "grad_norm": 294.3619384765625, + "learning_rate": 2.7887477313974592e-05, + "loss": 45.396, + "step": 398 + }, + { + "epoch": 1.4406320541760722, + "grad_norm": 273.31427001953125, + "learning_rate": 2.7882032667876588e-05, + "loss": 43.6358, + "step": 399 + }, + { + "epoch": 1.4442437923250564, + "grad_norm": 317.6174011230469, + "learning_rate": 2.7876588021778583e-05, + "loss": 45.2258, + "step": 400 + }, + { + "epoch": 1.4442437923250564, + "eval_loss": 0.6741424202919006, + "eval_runtime": 3.1349, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 400 + }, + { + "epoch": 1.4478555304740406, + "grad_norm": 267.40118408203125, + "learning_rate": 2.787114337568058e-05, + "loss": 44.0452, + "step": 401 + }, + { + "epoch": 1.4514672686230248, + "grad_norm": 243.23074340820312, + "learning_rate": 2.786569872958258e-05, + "loss": 44.8225, + "step": 402 + }, + { + "epoch": 1.455079006772009, + "grad_norm": 313.2763366699219, + "learning_rate": 2.7860254083484576e-05, + "loss": 46.3814, + "step": 403 + }, + { + "epoch": 1.4586907449209932, + "grad_norm": 348.0602722167969, + "learning_rate": 2.785480943738657e-05, + "loss": 44.5303, + "step": 404 + }, + { + "epoch": 1.4623024830699773, + "grad_norm": 307.08819580078125, + "learning_rate": 2.7849364791288567e-05, + "loss": 46.2257, + "step": 405 + }, + { + "epoch": 1.4659142212189615, + "grad_norm": 283.5260925292969, + "learning_rate": 2.7843920145190562e-05, + "loss": 42.795, + "step": 406 + }, + { + "epoch": 1.469525959367946, + "grad_norm": 470.912841796875, + "learning_rate": 2.783847549909256e-05, + "loss": 40.3193, + "step": 407 + }, + { + "epoch": 1.47313769751693, + "grad_norm": 499.6931457519531, + "learning_rate": 2.7833030852994556e-05, + "loss": 39.166, + "step": 408 + }, + { + "epoch": 1.4767494356659143, + "grad_norm": 440.8569641113281, + "learning_rate": 2.782758620689655e-05, + "loss": 38.0724, + "step": 409 + }, + { + "epoch": 1.4803611738148983, + "grad_norm": 307.85919189453125, + "learning_rate": 2.7822141560798547e-05, + "loss": 38.5902, + "step": 410 + }, + { + "epoch": 1.4803611738148983, + "eval_loss": 0.6782167553901672, + "eval_runtime": 3.1368, + "eval_samples_per_second": 57.065, + "eval_steps_per_second": 57.065, + "step": 410 + }, + { + "epoch": 1.4839729119638827, + "grad_norm": 300.9029846191406, + "learning_rate": 2.7816696914700546e-05, + "loss": 38.9796, + "step": 411 + }, + { + "epoch": 1.487584650112867, + "grad_norm": 406.2210693359375, + "learning_rate": 2.781125226860254e-05, + "loss": 38.3992, + "step": 412 + }, + { + "epoch": 1.491196388261851, + "grad_norm": 374.5141906738281, + "learning_rate": 2.780580762250454e-05, + "loss": 39.0204, + "step": 413 + }, + { + "epoch": 1.4948081264108353, + "grad_norm": 437.4369201660156, + "learning_rate": 2.7800362976406535e-05, + "loss": 40.1299, + "step": 414 + }, + { + "epoch": 1.4984198645598195, + "grad_norm": 272.6376953125, + "learning_rate": 2.779491833030853e-05, + "loss": 40.1278, + "step": 415 + }, + { + "epoch": 1.5020316027088036, + "grad_norm": 320.0819091796875, + "learning_rate": 2.7789473684210526e-05, + "loss": 39.6137, + "step": 416 + }, + { + "epoch": 1.5056433408577878, + "grad_norm": 351.5314025878906, + "learning_rate": 2.778402903811252e-05, + "loss": 41.0757, + "step": 417 + }, + { + "epoch": 1.509255079006772, + "grad_norm": 331.9538879394531, + "learning_rate": 2.777858439201452e-05, + "loss": 40.3751, + "step": 418 + }, + { + "epoch": 1.5128668171557562, + "grad_norm": 339.1962585449219, + "learning_rate": 2.7773139745916516e-05, + "loss": 41.3858, + "step": 419 + }, + { + "epoch": 1.5164785553047404, + "grad_norm": 264.5666198730469, + "learning_rate": 2.776769509981851e-05, + "loss": 42.1872, + "step": 420 + }, + { + "epoch": 1.5164785553047404, + "eval_loss": 0.6758362650871277, + "eval_runtime": 3.1397, + "eval_samples_per_second": 57.012, + "eval_steps_per_second": 57.012, + "step": 420 + }, + { + "epoch": 1.5200902934537246, + "grad_norm": 282.2214050292969, + "learning_rate": 2.776225045372051e-05, + "loss": 41.5158, + "step": 421 + }, + { + "epoch": 1.5237020316027088, + "grad_norm": 314.0169677734375, + "learning_rate": 2.7756805807622505e-05, + "loss": 39.6937, + "step": 422 + }, + { + "epoch": 1.527313769751693, + "grad_norm": 258.1871337890625, + "learning_rate": 2.77513611615245e-05, + "loss": 30.1697, + "step": 423 + }, + { + "epoch": 1.5309255079006772, + "grad_norm": 213.88528442382812, + "learning_rate": 2.77459165154265e-05, + "loss": 26.0674, + "step": 424 + }, + { + "epoch": 1.5345372460496614, + "grad_norm": 313.9029235839844, + "learning_rate": 2.7740471869328495e-05, + "loss": 26.2021, + "step": 425 + }, + { + "epoch": 1.5381489841986458, + "grad_norm": 334.663330078125, + "learning_rate": 2.773502722323049e-05, + "loss": 26.9734, + "step": 426 + }, + { + "epoch": 1.5417607223476297, + "grad_norm": 304.77117919921875, + "learning_rate": 2.7729582577132486e-05, + "loss": 27.5513, + "step": 427 + }, + { + "epoch": 1.5453724604966141, + "grad_norm": 642.5489501953125, + "learning_rate": 2.772413793103448e-05, + "loss": 52.8855, + "step": 428 + }, + { + "epoch": 1.548984198645598, + "grad_norm": 579.0210571289062, + "learning_rate": 2.771869328493648e-05, + "loss": 53.095, + "step": 429 + }, + { + "epoch": 1.5525959367945825, + "grad_norm": 502.8334045410156, + "learning_rate": 2.771324863883848e-05, + "loss": 52.0631, + "step": 430 + }, + { + "epoch": 1.5525959367945825, + "eval_loss": 0.70591801404953, + "eval_runtime": 3.1403, + "eval_samples_per_second": 57.001, + "eval_steps_per_second": 57.001, + "step": 430 + }, + { + "epoch": 1.5562076749435665, + "grad_norm": 452.4619140625, + "learning_rate": 2.7707803992740474e-05, + "loss": 49.6795, + "step": 431 + }, + { + "epoch": 1.559819413092551, + "grad_norm": 434.84326171875, + "learning_rate": 2.770235934664247e-05, + "loss": 50.089, + "step": 432 + }, + { + "epoch": 1.5634311512415349, + "grad_norm": 389.4812927246094, + "learning_rate": 2.7696914700544465e-05, + "loss": 48.5218, + "step": 433 + }, + { + "epoch": 1.5670428893905193, + "grad_norm": 279.72027587890625, + "learning_rate": 2.769147005444646e-05, + "loss": 48.0049, + "step": 434 + }, + { + "epoch": 1.5706546275395032, + "grad_norm": 294.6167907714844, + "learning_rate": 2.768602540834846e-05, + "loss": 47.7967, + "step": 435 + }, + { + "epoch": 1.5742663656884877, + "grad_norm": 296.6061706542969, + "learning_rate": 2.7680580762250454e-05, + "loss": 48.3725, + "step": 436 + }, + { + "epoch": 1.5778781038374716, + "grad_norm": 448.601318359375, + "learning_rate": 2.767513611615245e-05, + "loss": 47.3637, + "step": 437 + }, + { + "epoch": 1.581489841986456, + "grad_norm": 401.6792297363281, + "learning_rate": 2.7669691470054445e-05, + "loss": 45.8428, + "step": 438 + }, + { + "epoch": 1.5851015801354402, + "grad_norm": 383.7574768066406, + "learning_rate": 2.7664246823956444e-05, + "loss": 45.0625, + "step": 439 + }, + { + "epoch": 1.5887133182844244, + "grad_norm": 354.9222412109375, + "learning_rate": 2.7658802177858442e-05, + "loss": 45.0018, + "step": 440 + }, + { + "epoch": 1.5887133182844244, + "eval_loss": 0.6869362592697144, + "eval_runtime": 3.1396, + "eval_samples_per_second": 57.013, + "eval_steps_per_second": 57.013, + "step": 440 + }, + { + "epoch": 1.5923250564334086, + "grad_norm": 332.02191162109375, + "learning_rate": 2.7653357531760438e-05, + "loss": 42.2533, + "step": 441 + }, + { + "epoch": 1.5959367945823928, + "grad_norm": 214.40272521972656, + "learning_rate": 2.7647912885662433e-05, + "loss": 43.0666, + "step": 442 + }, + { + "epoch": 1.599548532731377, + "grad_norm": 234.03184509277344, + "learning_rate": 2.764246823956443e-05, + "loss": 43.7141, + "step": 443 + }, + { + "epoch": 1.6031602708803612, + "grad_norm": 290.4942626953125, + "learning_rate": 2.7637023593466424e-05, + "loss": 43.1818, + "step": 444 + }, + { + "epoch": 1.6067720090293454, + "grad_norm": 368.3863525390625, + "learning_rate": 2.7631578947368423e-05, + "loss": 44.992, + "step": 445 + }, + { + "epoch": 1.6103837471783295, + "grad_norm": 256.7243347167969, + "learning_rate": 2.7626134301270418e-05, + "loss": 43.9973, + "step": 446 + }, + { + "epoch": 1.6139954853273137, + "grad_norm": 235.7418670654297, + "learning_rate": 2.7620689655172413e-05, + "loss": 44.1467, + "step": 447 + }, + { + "epoch": 1.617607223476298, + "grad_norm": 269.3458251953125, + "learning_rate": 2.7615245009074412e-05, + "loss": 44.3638, + "step": 448 + }, + { + "epoch": 1.6212189616252821, + "grad_norm": 267.63104248046875, + "learning_rate": 2.7609800362976408e-05, + "loss": 45.5499, + "step": 449 + }, + { + "epoch": 1.6248306997742663, + "grad_norm": 266.48260498046875, + "learning_rate": 2.7604355716878403e-05, + "loss": 44.6896, + "step": 450 + }, + { + "epoch": 1.6248306997742663, + "eval_loss": 0.6687367558479309, + "eval_runtime": 3.138, + "eval_samples_per_second": 57.042, + "eval_steps_per_second": 57.042, + "step": 450 + }, + { + "epoch": 1.6284424379232507, + "grad_norm": 280.531005859375, + "learning_rate": 2.7598911070780402e-05, + "loss": 44.4839, + "step": 451 + }, + { + "epoch": 1.6320541760722347, + "grad_norm": 277.5115966796875, + "learning_rate": 2.7593466424682397e-05, + "loss": 44.4457, + "step": 452 + }, + { + "epoch": 1.635665914221219, + "grad_norm": 283.0730285644531, + "learning_rate": 2.7588021778584393e-05, + "loss": 45.3896, + "step": 453 + }, + { + "epoch": 1.639277652370203, + "grad_norm": 220.58546447753906, + "learning_rate": 2.7582577132486388e-05, + "loss": 45.1627, + "step": 454 + }, + { + "epoch": 1.6428893905191875, + "grad_norm": 221.82968139648438, + "learning_rate": 2.7577132486388383e-05, + "loss": 44.0173, + "step": 455 + }, + { + "epoch": 1.6465011286681714, + "grad_norm": 293.05828857421875, + "learning_rate": 2.7571687840290382e-05, + "loss": 41.7427, + "step": 456 + }, + { + "epoch": 1.6501128668171559, + "grad_norm": 308.2817077636719, + "learning_rate": 2.756624319419238e-05, + "loss": 38.7822, + "step": 457 + }, + { + "epoch": 1.6537246049661398, + "grad_norm": 322.1114196777344, + "learning_rate": 2.7560798548094376e-05, + "loss": 37.8994, + "step": 458 + }, + { + "epoch": 1.6573363431151242, + "grad_norm": 357.4956359863281, + "learning_rate": 2.755535390199637e-05, + "loss": 38.2092, + "step": 459 + }, + { + "epoch": 1.6609480812641082, + "grad_norm": 298.619384765625, + "learning_rate": 2.7549909255898367e-05, + "loss": 39.1363, + "step": 460 + }, + { + "epoch": 1.6609480812641082, + "eval_loss": 0.6787883639335632, + "eval_runtime": 3.1331, + "eval_samples_per_second": 57.131, + "eval_steps_per_second": 57.131, + "step": 460 + }, + { + "epoch": 1.6645598194130926, + "grad_norm": 353.0351867675781, + "learning_rate": 2.7544464609800362e-05, + "loss": 37.5096, + "step": 461 + }, + { + "epoch": 1.6681715575620768, + "grad_norm": 344.4702453613281, + "learning_rate": 2.753901996370236e-05, + "loss": 39.3225, + "step": 462 + }, + { + "epoch": 1.671783295711061, + "grad_norm": 349.8557434082031, + "learning_rate": 2.7533575317604357e-05, + "loss": 39.4745, + "step": 463 + }, + { + "epoch": 1.6753950338600452, + "grad_norm": 285.15765380859375, + "learning_rate": 2.7528130671506352e-05, + "loss": 39.7513, + "step": 464 + }, + { + "epoch": 1.6790067720090294, + "grad_norm": 329.09149169921875, + "learning_rate": 2.7522686025408347e-05, + "loss": 40.441, + "step": 465 + }, + { + "epoch": 1.6826185101580136, + "grad_norm": 246.67437744140625, + "learning_rate": 2.7517241379310343e-05, + "loss": 40.0033, + "step": 466 + }, + { + "epoch": 1.6862302483069977, + "grad_norm": 299.9590148925781, + "learning_rate": 2.7511796733212345e-05, + "loss": 41.4558, + "step": 467 + }, + { + "epoch": 1.689841986455982, + "grad_norm": 315.5220642089844, + "learning_rate": 2.750635208711434e-05, + "loss": 40.8088, + "step": 468 + }, + { + "epoch": 1.6934537246049661, + "grad_norm": 256.2172546386719, + "learning_rate": 2.7500907441016336e-05, + "loss": 40.4457, + "step": 469 + }, + { + "epoch": 1.6970654627539503, + "grad_norm": 345.38983154296875, + "learning_rate": 2.749546279491833e-05, + "loss": 42.0739, + "step": 470 + }, + { + "epoch": 1.6970654627539503, + "eval_loss": 0.6835405826568604, + "eval_runtime": 3.1373, + "eval_samples_per_second": 57.056, + "eval_steps_per_second": 57.056, + "step": 470 + }, + { + "epoch": 1.7006772009029345, + "grad_norm": 425.0630187988281, + "learning_rate": 2.7490018148820326e-05, + "loss": 41.6554, + "step": 471 + }, + { + "epoch": 1.7042889390519187, + "grad_norm": 286.5938720703125, + "learning_rate": 2.7484573502722322e-05, + "loss": 39.4656, + "step": 472 + }, + { + "epoch": 1.7079006772009029, + "grad_norm": 356.6265869140625, + "learning_rate": 2.747912885662432e-05, + "loss": 26.6268, + "step": 473 + }, + { + "epoch": 1.7115124153498873, + "grad_norm": 319.0960388183594, + "learning_rate": 2.7473684210526316e-05, + "loss": 26.344, + "step": 474 + }, + { + "epoch": 1.7151241534988713, + "grad_norm": 217.50375366210938, + "learning_rate": 2.746823956442831e-05, + "loss": 27.099, + "step": 475 + }, + { + "epoch": 1.7187358916478557, + "grad_norm": 199.71047973632812, + "learning_rate": 2.746279491833031e-05, + "loss": 27.293, + "step": 476 + }, + { + "epoch": 1.7223476297968396, + "grad_norm": 222.1556396484375, + "learning_rate": 2.7457350272232305e-05, + "loss": 26.5473, + "step": 477 + }, + { + "epoch": 1.725959367945824, + "grad_norm": 740.6441650390625, + "learning_rate": 2.7451905626134304e-05, + "loss": 53.8046, + "step": 478 + }, + { + "epoch": 1.729571106094808, + "grad_norm": 792.884765625, + "learning_rate": 2.74464609800363e-05, + "loss": 54.1166, + "step": 479 + }, + { + "epoch": 1.7331828442437924, + "grad_norm": 613.0400390625, + "learning_rate": 2.7441016333938295e-05, + "loss": 51.6648, + "step": 480 + }, + { + "epoch": 1.7331828442437924, + "eval_loss": 0.704450786113739, + "eval_runtime": 3.1367, + "eval_samples_per_second": 57.067, + "eval_steps_per_second": 57.067, + "step": 480 + }, + { + "epoch": 1.7367945823927764, + "grad_norm": 469.0580139160156, + "learning_rate": 2.743557168784029e-05, + "loss": 49.7201, + "step": 481 + }, + { + "epoch": 1.7404063205417608, + "grad_norm": 347.80206298828125, + "learning_rate": 2.7430127041742286e-05, + "loss": 48.8945, + "step": 482 + }, + { + "epoch": 1.7440180586907448, + "grad_norm": 334.3109436035156, + "learning_rate": 2.742468239564428e-05, + "loss": 48.1509, + "step": 483 + }, + { + "epoch": 1.7476297968397292, + "grad_norm": 276.7007141113281, + "learning_rate": 2.741923774954628e-05, + "loss": 47.801, + "step": 484 + }, + { + "epoch": 1.7512415349887132, + "grad_norm": 322.46575927734375, + "learning_rate": 2.741379310344828e-05, + "loss": 47.9838, + "step": 485 + }, + { + "epoch": 1.7548532731376976, + "grad_norm": 320.534912109375, + "learning_rate": 2.7408348457350274e-05, + "loss": 46.9847, + "step": 486 + }, + { + "epoch": 1.7584650112866818, + "grad_norm": 415.94580078125, + "learning_rate": 2.740290381125227e-05, + "loss": 47.8941, + "step": 487 + }, + { + "epoch": 1.762076749435666, + "grad_norm": 299.8996276855469, + "learning_rate": 2.7397459165154265e-05, + "loss": 46.5792, + "step": 488 + }, + { + "epoch": 1.7656884875846501, + "grad_norm": 275.3497314453125, + "learning_rate": 2.7392014519056264e-05, + "loss": 43.625, + "step": 489 + }, + { + "epoch": 1.7693002257336343, + "grad_norm": 281.14251708984375, + "learning_rate": 2.738656987295826e-05, + "loss": 42.5925, + "step": 490 + }, + { + "epoch": 1.7693002257336343, + "eval_loss": 0.6785204410552979, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.152, + "eval_steps_per_second": 57.152, + "step": 490 + }, + { + "epoch": 1.7729119638826185, + "grad_norm": 355.0955505371094, + "learning_rate": 2.7381125226860254e-05, + "loss": 43.7302, + "step": 491 + }, + { + "epoch": 1.7765237020316027, + "grad_norm": 302.5424499511719, + "learning_rate": 2.737568058076225e-05, + "loss": 42.6815, + "step": 492 + }, + { + "epoch": 1.780135440180587, + "grad_norm": 288.3701171875, + "learning_rate": 2.7370235934664245e-05, + "loss": 43.3076, + "step": 493 + }, + { + "epoch": 1.783747178329571, + "grad_norm": 288.28863525390625, + "learning_rate": 2.7364791288566244e-05, + "loss": 43.5499, + "step": 494 + }, + { + "epoch": 1.7873589164785553, + "grad_norm": 277.82171630859375, + "learning_rate": 2.7359346642468243e-05, + "loss": 45.5163, + "step": 495 + }, + { + "epoch": 1.7909706546275395, + "grad_norm": 240.2311248779297, + "learning_rate": 2.7353901996370238e-05, + "loss": 43.8984, + "step": 496 + }, + { + "epoch": 1.7945823927765236, + "grad_norm": 280.1030578613281, + "learning_rate": 2.7348457350272233e-05, + "loss": 44.7699, + "step": 497 + }, + { + "epoch": 1.7981941309255078, + "grad_norm": 260.6531982421875, + "learning_rate": 2.734301270417423e-05, + "loss": 44.2966, + "step": 498 + }, + { + "epoch": 1.8018058690744923, + "grad_norm": 284.82989501953125, + "learning_rate": 2.7337568058076224e-05, + "loss": 44.8812, + "step": 499 + }, + { + "epoch": 1.8054176072234762, + "grad_norm": 228.4029541015625, + "learning_rate": 2.7332123411978223e-05, + "loss": 45.3101, + "step": 500 + }, + { + "epoch": 1.8054176072234762, + "eval_loss": 0.6614294648170471, + "eval_runtime": 3.1354, + "eval_samples_per_second": 57.091, + "eval_steps_per_second": 57.091, + "step": 500 + }, + { + "epoch": 1.8090293453724606, + "grad_norm": 253.9024200439453, + "learning_rate": 2.732667876588022e-05, + "loss": 44.7325, + "step": 501 + }, + { + "epoch": 1.8126410835214446, + "grad_norm": 234.1785888671875, + "learning_rate": 2.7321234119782214e-05, + "loss": 44.5544, + "step": 502 + }, + { + "epoch": 1.816252821670429, + "grad_norm": 252.2194061279297, + "learning_rate": 2.7315789473684213e-05, + "loss": 45.0984, + "step": 503 + }, + { + "epoch": 1.819864559819413, + "grad_norm": 244.02610778808594, + "learning_rate": 2.7310344827586208e-05, + "loss": 44.5591, + "step": 504 + }, + { + "epoch": 1.8234762979683974, + "grad_norm": 252.48089599609375, + "learning_rate": 2.7304900181488203e-05, + "loss": 43.7073, + "step": 505 + }, + { + "epoch": 1.8270880361173814, + "grad_norm": 258.9751892089844, + "learning_rate": 2.7299455535390202e-05, + "loss": 40.7267, + "step": 506 + }, + { + "epoch": 1.8306997742663658, + "grad_norm": 322.91387939453125, + "learning_rate": 2.7294010889292197e-05, + "loss": 39.0883, + "step": 507 + }, + { + "epoch": 1.8343115124153497, + "grad_norm": 392.3733215332031, + "learning_rate": 2.7288566243194193e-05, + "loss": 37.8859, + "step": 508 + }, + { + "epoch": 1.8379232505643341, + "grad_norm": 330.35089111328125, + "learning_rate": 2.7283121597096188e-05, + "loss": 37.6328, + "step": 509 + }, + { + "epoch": 1.8415349887133183, + "grad_norm": 306.2722473144531, + "learning_rate": 2.7277676950998184e-05, + "loss": 38.4354, + "step": 510 + }, + { + "epoch": 1.8415349887133183, + "eval_loss": 0.6802475452423096, + "eval_runtime": 3.1337, + "eval_samples_per_second": 57.12, + "eval_steps_per_second": 57.12, + "step": 510 + }, + { + "epoch": 1.8451467268623025, + "grad_norm": 376.08319091796875, + "learning_rate": 2.7272232304900182e-05, + "loss": 37.5668, + "step": 511 + }, + { + "epoch": 1.8487584650112867, + "grad_norm": 245.11607360839844, + "learning_rate": 2.7266787658802178e-05, + "loss": 39.0387, + "step": 512 + }, + { + "epoch": 1.852370203160271, + "grad_norm": 326.08740234375, + "learning_rate": 2.7261343012704177e-05, + "loss": 39.8013, + "step": 513 + }, + { + "epoch": 1.855981941309255, + "grad_norm": 286.0172119140625, + "learning_rate": 2.7255898366606172e-05, + "loss": 39.1249, + "step": 514 + }, + { + "epoch": 1.8595936794582393, + "grad_norm": 279.9872741699219, + "learning_rate": 2.7250453720508167e-05, + "loss": 38.9208, + "step": 515 + }, + { + "epoch": 1.8632054176072235, + "grad_norm": 273.5589904785156, + "learning_rate": 2.7245009074410163e-05, + "loss": 39.6188, + "step": 516 + }, + { + "epoch": 1.8668171557562077, + "grad_norm": 241.08322143554688, + "learning_rate": 2.723956442831216e-05, + "loss": 39.9636, + "step": 517 + }, + { + "epoch": 1.8704288939051918, + "grad_norm": 282.7255554199219, + "learning_rate": 2.7234119782214157e-05, + "loss": 39.7664, + "step": 518 + }, + { + "epoch": 1.874040632054176, + "grad_norm": 276.45819091796875, + "learning_rate": 2.7228675136116152e-05, + "loss": 40.4444, + "step": 519 + }, + { + "epoch": 1.8776523702031602, + "grad_norm": 274.9344787597656, + "learning_rate": 2.7223230490018148e-05, + "loss": 41.3736, + "step": 520 + }, + { + "epoch": 1.8776523702031602, + "eval_loss": 0.6779935956001282, + "eval_runtime": 3.1278, + "eval_samples_per_second": 57.228, + "eval_steps_per_second": 57.228, + "step": 520 + }, + { + "epoch": 1.8812641083521444, + "grad_norm": 251.0371551513672, + "learning_rate": 2.7217785843920143e-05, + "loss": 41.0723, + "step": 521 + }, + { + "epoch": 1.8848758465011288, + "grad_norm": 313.0828552246094, + "learning_rate": 2.7212341197822145e-05, + "loss": 41.552, + "step": 522 + }, + { + "epoch": 1.8884875846501128, + "grad_norm": 246.2321319580078, + "learning_rate": 2.720689655172414e-05, + "loss": 41.1185, + "step": 523 + }, + { + "epoch": 1.8920993227990972, + "grad_norm": 243.4658660888672, + "learning_rate": 2.7201451905626136e-05, + "loss": 26.9467, + "step": 524 + }, + { + "epoch": 1.8957110609480812, + "grad_norm": 234.8782196044922, + "learning_rate": 2.719600725952813e-05, + "loss": 26.1988, + "step": 525 + }, + { + "epoch": 1.8993227990970656, + "grad_norm": 218.89500427246094, + "learning_rate": 2.7190562613430127e-05, + "loss": 26.4887, + "step": 526 + }, + { + "epoch": 1.9029345372460496, + "grad_norm": 241.71937561035156, + "learning_rate": 2.7185117967332122e-05, + "loss": 26.9755, + "step": 527 + }, + { + "epoch": 1.906546275395034, + "grad_norm": 532.9345092773438, + "learning_rate": 2.717967332123412e-05, + "loss": 52.2138, + "step": 528 + }, + { + "epoch": 1.910158013544018, + "grad_norm": 600.501220703125, + "learning_rate": 2.7174228675136116e-05, + "loss": 51.3975, + "step": 529 + }, + { + "epoch": 1.9137697516930023, + "grad_norm": 570.4301147460938, + "learning_rate": 2.716878402903811e-05, + "loss": 51.2848, + "step": 530 + }, + { + "epoch": 1.9137697516930023, + "eval_loss": 0.7027958035469055, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 530 + }, + { + "epoch": 1.9173814898419863, + "grad_norm": 406.2899475097656, + "learning_rate": 2.716333938294011e-05, + "loss": 49.1175, + "step": 531 + }, + { + "epoch": 1.9209932279909707, + "grad_norm": 369.8658447265625, + "learning_rate": 2.7157894736842106e-05, + "loss": 47.7343, + "step": 532 + }, + { + "epoch": 1.9246049661399547, + "grad_norm": 338.6764831542969, + "learning_rate": 2.7152450090744105e-05, + "loss": 48.1818, + "step": 533 + }, + { + "epoch": 1.928216704288939, + "grad_norm": 283.4834899902344, + "learning_rate": 2.71470054446461e-05, + "loss": 45.686, + "step": 534 + }, + { + "epoch": 1.9318284424379233, + "grad_norm": 327.53472900390625, + "learning_rate": 2.7141560798548095e-05, + "loss": 44.3277, + "step": 535 + }, + { + "epoch": 1.9354401805869075, + "grad_norm": 329.0078125, + "learning_rate": 2.713611615245009e-05, + "loss": 44.4469, + "step": 536 + }, + { + "epoch": 1.9390519187358917, + "grad_norm": 270.3822021484375, + "learning_rate": 2.7130671506352086e-05, + "loss": 43.9265, + "step": 537 + }, + { + "epoch": 1.9426636568848759, + "grad_norm": 224.95742797851562, + "learning_rate": 2.7125226860254085e-05, + "loss": 44.3072, + "step": 538 + }, + { + "epoch": 1.94627539503386, + "grad_norm": 240.5491943359375, + "learning_rate": 2.711978221415608e-05, + "loss": 43.8803, + "step": 539 + }, + { + "epoch": 1.9498871331828442, + "grad_norm": 284.5292663574219, + "learning_rate": 2.711433756805808e-05, + "loss": 46.5793, + "step": 540 + }, + { + "epoch": 1.9498871331828442, + "eval_loss": 0.6684675812721252, + "eval_runtime": 3.1311, + "eval_samples_per_second": 57.168, + "eval_steps_per_second": 57.168, + "step": 540 + }, + { + "epoch": 1.9534988713318284, + "grad_norm": 281.6688537597656, + "learning_rate": 2.7108892921960074e-05, + "loss": 45.9066, + "step": 541 + }, + { + "epoch": 1.9571106094808126, + "grad_norm": 283.83514404296875, + "learning_rate": 2.710344827586207e-05, + "loss": 45.677, + "step": 542 + }, + { + "epoch": 1.9607223476297968, + "grad_norm": 227.1653289794922, + "learning_rate": 2.7098003629764065e-05, + "loss": 40.1636, + "step": 543 + }, + { + "epoch": 1.964334085778781, + "grad_norm": 294.7088928222656, + "learning_rate": 2.7092558983666064e-05, + "loss": 37.5158, + "step": 544 + }, + { + "epoch": 1.9679458239277654, + "grad_norm": 225.4500732421875, + "learning_rate": 2.708711433756806e-05, + "loss": 36.7221, + "step": 545 + }, + { + "epoch": 1.9715575620767494, + "grad_norm": 244.72509765625, + "learning_rate": 2.7081669691470055e-05, + "loss": 39.5524, + "step": 546 + }, + { + "epoch": 1.9751693002257338, + "grad_norm": 232.00390625, + "learning_rate": 2.707622504537205e-05, + "loss": 38.6021, + "step": 547 + }, + { + "epoch": 1.9787810383747177, + "grad_norm": 241.02322387695312, + "learning_rate": 2.7070780399274045e-05, + "loss": 39.9881, + "step": 548 + }, + { + "epoch": 1.9823927765237022, + "grad_norm": 244.6790771484375, + "learning_rate": 2.7065335753176044e-05, + "loss": 40.5002, + "step": 549 + }, + { + "epoch": 1.9860045146726861, + "grad_norm": 336.2419128417969, + "learning_rate": 2.7059891107078043e-05, + "loss": 41.5041, + "step": 550 + }, + { + "epoch": 1.9860045146726861, + "eval_loss": 0.68587726354599, + "eval_runtime": 3.1319, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 550 + }, + { + "epoch": 1.9896162528216705, + "grad_norm": 283.76629638671875, + "learning_rate": 2.705444646098004e-05, + "loss": 41.0936, + "step": 551 + }, + { + "epoch": 1.9932279909706545, + "grad_norm": 236.99427795410156, + "learning_rate": 2.7049001814882034e-05, + "loss": 30.5475, + "step": 552 + }, + { + "epoch": 1.996839729119639, + "grad_norm": 247.35618591308594, + "learning_rate": 2.704355716878403e-05, + "loss": 25.829, + "step": 553 + }, + { + "epoch": 2.0, + "grad_norm": 206.315185546875, + "learning_rate": 2.7038112522686025e-05, + "loss": 24.0575, + "step": 554 + }, + { + "epoch": 2.0036117381489844, + "grad_norm": 499.1221923828125, + "learning_rate": 2.7032667876588023e-05, + "loss": 50.3468, + "step": 555 + }, + { + "epoch": 2.0072234762979684, + "grad_norm": 415.1005859375, + "learning_rate": 2.702722323049002e-05, + "loss": 50.1256, + "step": 556 + }, + { + "epoch": 2.010835214446953, + "grad_norm": 414.549072265625, + "learning_rate": 2.7021778584392014e-05, + "loss": 50.4985, + "step": 557 + }, + { + "epoch": 2.0144469525959368, + "grad_norm": 339.5193786621094, + "learning_rate": 2.701633393829401e-05, + "loss": 48.4158, + "step": 558 + }, + { + "epoch": 2.018058690744921, + "grad_norm": 318.3045654296875, + "learning_rate": 2.7010889292196008e-05, + "loss": 48.3497, + "step": 559 + }, + { + "epoch": 2.021670428893905, + "grad_norm": 298.7594909667969, + "learning_rate": 2.7005444646098007e-05, + "loss": 47.0476, + "step": 560 + }, + { + "epoch": 2.021670428893905, + "eval_loss": 0.6773737668991089, + "eval_runtime": 3.132, + "eval_samples_per_second": 57.153, + "eval_steps_per_second": 57.153, + "step": 560 + }, + { + "epoch": 2.0252821670428895, + "grad_norm": 238.1414031982422, + "learning_rate": 2.7000000000000002e-05, + "loss": 46.6903, + "step": 561 + }, + { + "epoch": 2.0288939051918735, + "grad_norm": 225.9528045654297, + "learning_rate": 2.6994555353901998e-05, + "loss": 47.351, + "step": 562 + }, + { + "epoch": 2.032505643340858, + "grad_norm": 264.1337890625, + "learning_rate": 2.6989110707803993e-05, + "loss": 46.7924, + "step": 563 + }, + { + "epoch": 2.036117381489842, + "grad_norm": 257.63311767578125, + "learning_rate": 2.698366606170599e-05, + "loss": 45.4036, + "step": 564 + }, + { + "epoch": 2.0397291196388263, + "grad_norm": 283.10980224609375, + "learning_rate": 2.6978221415607984e-05, + "loss": 45.1304, + "step": 565 + }, + { + "epoch": 2.0433408577878103, + "grad_norm": 280.585205078125, + "learning_rate": 2.6972776769509983e-05, + "loss": 45.1448, + "step": 566 + }, + { + "epoch": 2.0469525959367947, + "grad_norm": 282.609375, + "learning_rate": 2.6967332123411978e-05, + "loss": 43.4235, + "step": 567 + }, + { + "epoch": 2.0505643340857787, + "grad_norm": 259.24346923828125, + "learning_rate": 2.6961887477313977e-05, + "loss": 42.5758, + "step": 568 + }, + { + "epoch": 2.054176072234763, + "grad_norm": 246.6533966064453, + "learning_rate": 2.6956442831215972e-05, + "loss": 42.2048, + "step": 569 + }, + { + "epoch": 2.057787810383747, + "grad_norm": 250.3376007080078, + "learning_rate": 2.6950998185117968e-05, + "loss": 43.8324, + "step": 570 + }, + { + "epoch": 2.057787810383747, + "eval_loss": 0.665416419506073, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.107, + "eval_steps_per_second": 57.107, + "step": 570 + }, + { + "epoch": 2.0613995485327314, + "grad_norm": 255.93833923339844, + "learning_rate": 2.6945553539019966e-05, + "loss": 42.8104, + "step": 571 + }, + { + "epoch": 2.0650112866817154, + "grad_norm": 254.2652587890625, + "learning_rate": 2.6940108892921962e-05, + "loss": 43.7011, + "step": 572 + }, + { + "epoch": 2.0686230248307, + "grad_norm": 249.3634033203125, + "learning_rate": 2.6934664246823957e-05, + "loss": 44.6409, + "step": 573 + }, + { + "epoch": 2.072234762979684, + "grad_norm": 227.1998291015625, + "learning_rate": 2.6929219600725953e-05, + "loss": 43.5825, + "step": 574 + }, + { + "epoch": 2.075846501128668, + "grad_norm": 268.9518127441406, + "learning_rate": 2.6923774954627948e-05, + "loss": 44.9313, + "step": 575 + }, + { + "epoch": 2.079458239277652, + "grad_norm": 246.07669067382812, + "learning_rate": 2.6918330308529943e-05, + "loss": 42.9967, + "step": 576 + }, + { + "epoch": 2.0830699774266366, + "grad_norm": 244.62857055664062, + "learning_rate": 2.6912885662431945e-05, + "loss": 45.3452, + "step": 577 + }, + { + "epoch": 2.0866817155756205, + "grad_norm": 211.9022216796875, + "learning_rate": 2.690744101633394e-05, + "loss": 44.339, + "step": 578 + }, + { + "epoch": 2.090293453724605, + "grad_norm": 247.23330688476562, + "learning_rate": 2.6901996370235936e-05, + "loss": 44.9766, + "step": 579 + }, + { + "epoch": 2.0939051918735894, + "grad_norm": 205.37115478515625, + "learning_rate": 2.689655172413793e-05, + "loss": 43.7674, + "step": 580 + }, + { + "epoch": 2.0939051918735894, + "eval_loss": 0.6593071222305298, + "eval_runtime": 3.1344, + "eval_samples_per_second": 57.108, + "eval_steps_per_second": 57.108, + "step": 580 + }, + { + "epoch": 2.0975169300225733, + "grad_norm": 276.552734375, + "learning_rate": 2.6891107078039927e-05, + "loss": 45.9008, + "step": 581 + }, + { + "epoch": 2.1011286681715577, + "grad_norm": 222.2236785888672, + "learning_rate": 2.6885662431941926e-05, + "loss": 42.9123, + "step": 582 + }, + { + "epoch": 2.1047404063205417, + "grad_norm": 211.22268676757812, + "learning_rate": 2.688021778584392e-05, + "loss": 41.8036, + "step": 583 + }, + { + "epoch": 2.108352144469526, + "grad_norm": 247.3801727294922, + "learning_rate": 2.6874773139745917e-05, + "loss": 39.8408, + "step": 584 + }, + { + "epoch": 2.11196388261851, + "grad_norm": 325.9136962890625, + "learning_rate": 2.6869328493647912e-05, + "loss": 38.5368, + "step": 585 + }, + { + "epoch": 2.1155756207674945, + "grad_norm": 332.1748046875, + "learning_rate": 2.686388384754991e-05, + "loss": 38.2694, + "step": 586 + }, + { + "epoch": 2.1191873589164785, + "grad_norm": 249.74398803710938, + "learning_rate": 2.6858439201451906e-05, + "loss": 38.0195, + "step": 587 + }, + { + "epoch": 2.122799097065463, + "grad_norm": 278.7181396484375, + "learning_rate": 2.6852994555353905e-05, + "loss": 37.6475, + "step": 588 + }, + { + "epoch": 2.126410835214447, + "grad_norm": 254.46157836914062, + "learning_rate": 2.68475499092559e-05, + "loss": 37.5423, + "step": 589 + }, + { + "epoch": 2.1300225733634313, + "grad_norm": 345.65704345703125, + "learning_rate": 2.6842105263157896e-05, + "loss": 39.5874, + "step": 590 + }, + { + "epoch": 2.1300225733634313, + "eval_loss": 0.6737480163574219, + "eval_runtime": 3.136, + "eval_samples_per_second": 57.08, + "eval_steps_per_second": 57.08, + "step": 590 + }, + { + "epoch": 2.1336343115124152, + "grad_norm": 282.4167785644531, + "learning_rate": 2.683666061705989e-05, + "loss": 38.7371, + "step": 591 + }, + { + "epoch": 2.1372460496613996, + "grad_norm": 243.5838623046875, + "learning_rate": 2.6831215970961886e-05, + "loss": 39.2955, + "step": 592 + }, + { + "epoch": 2.1408577878103836, + "grad_norm": 229.0329132080078, + "learning_rate": 2.6825771324863885e-05, + "loss": 39.2204, + "step": 593 + }, + { + "epoch": 2.144469525959368, + "grad_norm": 247.46646118164062, + "learning_rate": 2.682032667876588e-05, + "loss": 39.3951, + "step": 594 + }, + { + "epoch": 2.148081264108352, + "grad_norm": 219.598388671875, + "learning_rate": 2.6814882032667876e-05, + "loss": 38.9043, + "step": 595 + }, + { + "epoch": 2.1516930022573364, + "grad_norm": 251.1849822998047, + "learning_rate": 2.6809437386569875e-05, + "loss": 39.1682, + "step": 596 + }, + { + "epoch": 2.1553047404063204, + "grad_norm": 316.6958312988281, + "learning_rate": 2.680399274047187e-05, + "loss": 39.6977, + "step": 597 + }, + { + "epoch": 2.1589164785553048, + "grad_norm": 305.8714904785156, + "learning_rate": 2.6798548094373865e-05, + "loss": 40.4904, + "step": 598 + }, + { + "epoch": 2.1625282167042887, + "grad_norm": 283.31634521484375, + "learning_rate": 2.6793103448275864e-05, + "loss": 41.0051, + "step": 599 + }, + { + "epoch": 2.166139954853273, + "grad_norm": 299.8731384277344, + "learning_rate": 2.678765880217786e-05, + "loss": 38.0505, + "step": 600 + }, + { + "epoch": 2.166139954853273, + "eval_loss": 0.6916811466217041, + "eval_runtime": 3.1348, + "eval_samples_per_second": 57.1, + "eval_steps_per_second": 57.1, + "step": 600 + }, + { + "epoch": 2.169751693002257, + "grad_norm": 255.5745086669922, + "learning_rate": 2.6782214156079855e-05, + "loss": 26.855, + "step": 601 + }, + { + "epoch": 2.1733634311512415, + "grad_norm": 228.0767822265625, + "learning_rate": 2.677676950998185e-05, + "loss": 25.9643, + "step": 602 + }, + { + "epoch": 2.176975169300226, + "grad_norm": 254.83799743652344, + "learning_rate": 2.6771324863883846e-05, + "loss": 26.3989, + "step": 603 + }, + { + "epoch": 2.18058690744921, + "grad_norm": 228.3594512939453, + "learning_rate": 2.6765880217785845e-05, + "loss": 26.5123, + "step": 604 + }, + { + "epoch": 2.1841986455981943, + "grad_norm": 480.9405822753906, + "learning_rate": 2.6760435571687843e-05, + "loss": 50.0409, + "step": 605 + }, + { + "epoch": 2.1878103837471783, + "grad_norm": 491.6844177246094, + "learning_rate": 2.675499092558984e-05, + "loss": 52.4059, + "step": 606 + }, + { + "epoch": 2.1914221218961627, + "grad_norm": 423.5033264160156, + "learning_rate": 2.6749546279491834e-05, + "loss": 50.7535, + "step": 607 + }, + { + "epoch": 2.1950338600451467, + "grad_norm": 407.8076171875, + "learning_rate": 2.674410163339383e-05, + "loss": 47.8934, + "step": 608 + }, + { + "epoch": 2.198645598194131, + "grad_norm": 339.0987854003906, + "learning_rate": 2.6738656987295825e-05, + "loss": 48.2125, + "step": 609 + }, + { + "epoch": 2.202257336343115, + "grad_norm": 336.1163635253906, + "learning_rate": 2.6733212341197824e-05, + "loss": 47.6501, + "step": 610 + }, + { + "epoch": 2.202257336343115, + "eval_loss": 0.67746502161026, + "eval_runtime": 3.1321, + "eval_samples_per_second": 57.15, + "eval_steps_per_second": 57.15, + "step": 610 + }, + { + "epoch": 2.2058690744920995, + "grad_norm": 289.6402587890625, + "learning_rate": 2.672776769509982e-05, + "loss": 46.557, + "step": 611 + }, + { + "epoch": 2.2094808126410834, + "grad_norm": 270.03790283203125, + "learning_rate": 2.6722323049001814e-05, + "loss": 48.0728, + "step": 612 + }, + { + "epoch": 2.213092550790068, + "grad_norm": 241.3233184814453, + "learning_rate": 2.671687840290381e-05, + "loss": 45.9273, + "step": 613 + }, + { + "epoch": 2.216704288939052, + "grad_norm": 270.06201171875, + "learning_rate": 2.671143375680581e-05, + "loss": 45.7327, + "step": 614 + }, + { + "epoch": 2.220316027088036, + "grad_norm": 239.87757873535156, + "learning_rate": 2.6705989110707807e-05, + "loss": 44.1507, + "step": 615 + }, + { + "epoch": 2.22392776523702, + "grad_norm": 240.35128784179688, + "learning_rate": 2.6700544464609803e-05, + "loss": 42.8332, + "step": 616 + }, + { + "epoch": 2.2275395033860046, + "grad_norm": 256.8591613769531, + "learning_rate": 2.6695099818511798e-05, + "loss": 42.2531, + "step": 617 + }, + { + "epoch": 2.2311512415349886, + "grad_norm": 255.26673889160156, + "learning_rate": 2.6689655172413793e-05, + "loss": 41.9307, + "step": 618 + }, + { + "epoch": 2.234762979683973, + "grad_norm": 235.0786895751953, + "learning_rate": 2.668421052631579e-05, + "loss": 43.077, + "step": 619 + }, + { + "epoch": 2.238374717832957, + "grad_norm": 242.18040466308594, + "learning_rate": 2.6678765880217784e-05, + "loss": 43.3731, + "step": 620 + }, + { + "epoch": 2.238374717832957, + "eval_loss": 0.6694422364234924, + "eval_runtime": 3.1351, + "eval_samples_per_second": 57.095, + "eval_steps_per_second": 57.095, + "step": 620 + }, + { + "epoch": 2.2419864559819414, + "grad_norm": 221.5685272216797, + "learning_rate": 2.6673321234119783e-05, + "loss": 42.2524, + "step": 621 + }, + { + "epoch": 2.2455981941309253, + "grad_norm": 212.63059997558594, + "learning_rate": 2.666787658802178e-05, + "loss": 42.9608, + "step": 622 + }, + { + "epoch": 2.2492099322799097, + "grad_norm": 204.1076202392578, + "learning_rate": 2.6662431941923777e-05, + "loss": 43.4169, + "step": 623 + }, + { + "epoch": 2.2528216704288937, + "grad_norm": 237.20144653320312, + "learning_rate": 2.6656987295825773e-05, + "loss": 43.4894, + "step": 624 + }, + { + "epoch": 2.256433408577878, + "grad_norm": 223.0536346435547, + "learning_rate": 2.6651542649727768e-05, + "loss": 42.8705, + "step": 625 + }, + { + "epoch": 2.2600451467268625, + "grad_norm": 262.2052001953125, + "learning_rate": 2.6646098003629767e-05, + "loss": 44.3716, + "step": 626 + }, + { + "epoch": 2.2636568848758465, + "grad_norm": 236.05906677246094, + "learning_rate": 2.6640653357531762e-05, + "loss": 44.4382, + "step": 627 + }, + { + "epoch": 2.2672686230248305, + "grad_norm": 238.1580810546875, + "learning_rate": 2.6635208711433757e-05, + "loss": 44.2845, + "step": 628 + }, + { + "epoch": 2.270880361173815, + "grad_norm": 256.60260009765625, + "learning_rate": 2.6629764065335753e-05, + "loss": 45.3699, + "step": 629 + }, + { + "epoch": 2.2744920993227993, + "grad_norm": 259.56512451171875, + "learning_rate": 2.6624319419237748e-05, + "loss": 43.3712, + "step": 630 + }, + { + "epoch": 2.2744920993227993, + "eval_loss": 0.6590501070022583, + "eval_runtime": 3.1299, + "eval_samples_per_second": 57.191, + "eval_steps_per_second": 57.191, + "step": 630 + }, + { + "epoch": 2.2781038374717832, + "grad_norm": 223.30166625976562, + "learning_rate": 2.6618874773139744e-05, + "loss": 44.44, + "step": 631 + }, + { + "epoch": 2.2817155756207677, + "grad_norm": 232.5362091064453, + "learning_rate": 2.6613430127041746e-05, + "loss": 45.6937, + "step": 632 + }, + { + "epoch": 2.2853273137697516, + "grad_norm": 212.84373474121094, + "learning_rate": 2.660798548094374e-05, + "loss": 42.2653, + "step": 633 + }, + { + "epoch": 2.288939051918736, + "grad_norm": 224.66473388671875, + "learning_rate": 2.6602540834845737e-05, + "loss": 40.0918, + "step": 634 + }, + { + "epoch": 2.29255079006772, + "grad_norm": 309.7171325683594, + "learning_rate": 2.6597096188747732e-05, + "loss": 37.3983, + "step": 635 + }, + { + "epoch": 2.2961625282167044, + "grad_norm": 313.0796203613281, + "learning_rate": 2.6591651542649727e-05, + "loss": 35.773, + "step": 636 + }, + { + "epoch": 2.2997742663656884, + "grad_norm": 357.21990966796875, + "learning_rate": 2.6586206896551726e-05, + "loss": 38.4892, + "step": 637 + }, + { + "epoch": 2.303386004514673, + "grad_norm": 319.89306640625, + "learning_rate": 2.658076225045372e-05, + "loss": 37.8198, + "step": 638 + }, + { + "epoch": 2.3069977426636568, + "grad_norm": 236.42787170410156, + "learning_rate": 2.6575317604355717e-05, + "loss": 37.3511, + "step": 639 + }, + { + "epoch": 2.310609480812641, + "grad_norm": 293.1517639160156, + "learning_rate": 2.6569872958257712e-05, + "loss": 38.6153, + "step": 640 + }, + { + "epoch": 2.310609480812641, + "eval_loss": 0.6686823964118958, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 640 + }, + { + "epoch": 2.314221218961625, + "grad_norm": 259.3082580566406, + "learning_rate": 2.6564428312159708e-05, + "loss": 38.5685, + "step": 641 + }, + { + "epoch": 2.3178329571106095, + "grad_norm": 223.2305145263672, + "learning_rate": 2.6558983666061706e-05, + "loss": 39.8481, + "step": 642 + }, + { + "epoch": 2.3214446952595935, + "grad_norm": 220.5127410888672, + "learning_rate": 2.6553539019963705e-05, + "loss": 39.2202, + "step": 643 + }, + { + "epoch": 2.325056433408578, + "grad_norm": 239.54837036132812, + "learning_rate": 2.65480943738657e-05, + "loss": 39.407, + "step": 644 + }, + { + "epoch": 2.328668171557562, + "grad_norm": 297.1054382324219, + "learning_rate": 2.6542649727767696e-05, + "loss": 38.6256, + "step": 645 + }, + { + "epoch": 2.3322799097065463, + "grad_norm": 274.5492858886719, + "learning_rate": 2.653720508166969e-05, + "loss": 39.5373, + "step": 646 + }, + { + "epoch": 2.3358916478555303, + "grad_norm": 285.3461608886719, + "learning_rate": 2.6531760435571687e-05, + "loss": 40.8163, + "step": 647 + }, + { + "epoch": 2.3395033860045147, + "grad_norm": 280.4156799316406, + "learning_rate": 2.6526315789473685e-05, + "loss": 39.5177, + "step": 648 + }, + { + "epoch": 2.343115124153499, + "grad_norm": 304.635986328125, + "learning_rate": 2.652087114337568e-05, + "loss": 39.3931, + "step": 649 + }, + { + "epoch": 2.346726862302483, + "grad_norm": 261.5251159667969, + "learning_rate": 2.6515426497277676e-05, + "loss": 36.4478, + "step": 650 + }, + { + "epoch": 2.346726862302483, + "eval_loss": 0.6913852691650391, + "eval_runtime": 3.1307, + "eval_samples_per_second": 57.176, + "eval_steps_per_second": 57.176, + "step": 650 + }, + { + "epoch": 2.350338600451467, + "grad_norm": 315.155029296875, + "learning_rate": 2.6509981851179675e-05, + "loss": 28.0558, + "step": 651 + }, + { + "epoch": 2.3539503386004514, + "grad_norm": 244.11302185058594, + "learning_rate": 2.650453720508167e-05, + "loss": 25.3844, + "step": 652 + }, + { + "epoch": 2.357562076749436, + "grad_norm": 214.6631317138672, + "learning_rate": 2.649909255898367e-05, + "loss": 25.7319, + "step": 653 + }, + { + "epoch": 2.36117381489842, + "grad_norm": 239.142822265625, + "learning_rate": 2.6493647912885664e-05, + "loss": 26.9239, + "step": 654 + }, + { + "epoch": 2.3647855530474042, + "grad_norm": 466.52301025390625, + "learning_rate": 2.648820326678766e-05, + "loss": 51.6751, + "step": 655 + }, + { + "epoch": 2.368397291196388, + "grad_norm": 417.0456848144531, + "learning_rate": 2.6482758620689655e-05, + "loss": 50.3911, + "step": 656 + }, + { + "epoch": 2.3720090293453726, + "grad_norm": 428.5924987792969, + "learning_rate": 2.647731397459165e-05, + "loss": 50.2844, + "step": 657 + }, + { + "epoch": 2.3756207674943566, + "grad_norm": 377.35205078125, + "learning_rate": 2.6471869328493646e-05, + "loss": 49.4586, + "step": 658 + }, + { + "epoch": 2.379232505643341, + "grad_norm": 319.4757080078125, + "learning_rate": 2.6466424682395645e-05, + "loss": 47.2812, + "step": 659 + }, + { + "epoch": 2.382844243792325, + "grad_norm": 294.8909912109375, + "learning_rate": 2.6460980036297644e-05, + "loss": 46.9634, + "step": 660 + }, + { + "epoch": 2.382844243792325, + "eval_loss": 0.669245183467865, + "eval_runtime": 3.1358, + "eval_samples_per_second": 57.082, + "eval_steps_per_second": 57.082, + "step": 660 + }, + { + "epoch": 2.3864559819413094, + "grad_norm": 261.2301940917969, + "learning_rate": 2.645553539019964e-05, + "loss": 46.9558, + "step": 661 + }, + { + "epoch": 2.3900677200902933, + "grad_norm": 258.07611083984375, + "learning_rate": 2.6450090744101634e-05, + "loss": 47.8019, + "step": 662 + }, + { + "epoch": 2.3936794582392777, + "grad_norm": 224.54913330078125, + "learning_rate": 2.644464609800363e-05, + "loss": 45.0965, + "step": 663 + }, + { + "epoch": 2.3972911963882617, + "grad_norm": 294.1282958984375, + "learning_rate": 2.643920145190563e-05, + "loss": 46.5213, + "step": 664 + }, + { + "epoch": 2.400902934537246, + "grad_norm": 286.87744140625, + "learning_rate": 2.6433756805807624e-05, + "loss": 44.6797, + "step": 665 + }, + { + "epoch": 2.40451467268623, + "grad_norm": 297.0935974121094, + "learning_rate": 2.642831215970962e-05, + "loss": 44.6766, + "step": 666 + }, + { + "epoch": 2.4081264108352145, + "grad_norm": 245.94793701171875, + "learning_rate": 2.6422867513611615e-05, + "loss": 42.2207, + "step": 667 + }, + { + "epoch": 2.4117381489841985, + "grad_norm": 227.2701416015625, + "learning_rate": 2.641742286751361e-05, + "loss": 41.9373, + "step": 668 + }, + { + "epoch": 2.415349887133183, + "grad_norm": 236.96005249023438, + "learning_rate": 2.641197822141561e-05, + "loss": 43.5779, + "step": 669 + }, + { + "epoch": 2.418961625282167, + "grad_norm": 244.6314239501953, + "learning_rate": 2.6406533575317608e-05, + "loss": 41.6609, + "step": 670 + }, + { + "epoch": 2.418961625282167, + "eval_loss": 0.6653958559036255, + "eval_runtime": 3.1314, + "eval_samples_per_second": 57.163, + "eval_steps_per_second": 57.163, + "step": 670 + }, + { + "epoch": 2.4225733634311513, + "grad_norm": 252.40667724609375, + "learning_rate": 2.6401088929219603e-05, + "loss": 43.4188, + "step": 671 + }, + { + "epoch": 2.4261851015801357, + "grad_norm": 218.78762817382812, + "learning_rate": 2.63956442831216e-05, + "loss": 42.4463, + "step": 672 + }, + { + "epoch": 2.4297968397291196, + "grad_norm": 216.69850158691406, + "learning_rate": 2.6390199637023594e-05, + "loss": 43.9986, + "step": 673 + }, + { + "epoch": 2.4334085778781036, + "grad_norm": 222.8838348388672, + "learning_rate": 2.638475499092559e-05, + "loss": 44.358, + "step": 674 + }, + { + "epoch": 2.437020316027088, + "grad_norm": 227.73489379882812, + "learning_rate": 2.6379310344827588e-05, + "loss": 42.2287, + "step": 675 + }, + { + "epoch": 2.4406320541760724, + "grad_norm": 227.0625762939453, + "learning_rate": 2.6373865698729583e-05, + "loss": 44.0429, + "step": 676 + }, + { + "epoch": 2.4442437923250564, + "grad_norm": 212.73170471191406, + "learning_rate": 2.636842105263158e-05, + "loss": 43.5408, + "step": 677 + }, + { + "epoch": 2.447855530474041, + "grad_norm": 213.81211853027344, + "learning_rate": 2.6362976406533574e-05, + "loss": 44.9755, + "step": 678 + }, + { + "epoch": 2.4514672686230248, + "grad_norm": 232.90858459472656, + "learning_rate": 2.6357531760435573e-05, + "loss": 44.0524, + "step": 679 + }, + { + "epoch": 2.455079006772009, + "grad_norm": 260.18408203125, + "learning_rate": 2.6352087114337568e-05, + "loss": 45.1275, + "step": 680 + }, + { + "epoch": 2.455079006772009, + "eval_loss": 0.6535969972610474, + "eval_runtime": 3.1327, + "eval_samples_per_second": 57.139, + "eval_steps_per_second": 57.139, + "step": 680 + }, + { + "epoch": 2.458690744920993, + "grad_norm": 283.89227294921875, + "learning_rate": 2.6346642468239567e-05, + "loss": 43.8625, + "step": 681 + }, + { + "epoch": 2.4623024830699776, + "grad_norm": 214.93670654296875, + "learning_rate": 2.6341197822141562e-05, + "loss": 44.1129, + "step": 682 + }, + { + "epoch": 2.4659142212189615, + "grad_norm": 207.7038116455078, + "learning_rate": 2.6335753176043558e-05, + "loss": 43.7334, + "step": 683 + }, + { + "epoch": 2.469525959367946, + "grad_norm": 320.4886779785156, + "learning_rate": 2.6330308529945553e-05, + "loss": 38.3952, + "step": 684 + }, + { + "epoch": 2.47313769751693, + "grad_norm": 356.9686279296875, + "learning_rate": 2.632486388384755e-05, + "loss": 38.443, + "step": 685 + }, + { + "epoch": 2.4767494356659143, + "grad_norm": 251.1065216064453, + "learning_rate": 2.6319419237749547e-05, + "loss": 36.617, + "step": 686 + }, + { + "epoch": 2.4803611738148983, + "grad_norm": 301.3539123535156, + "learning_rate": 2.6313974591651543e-05, + "loss": 38.6545, + "step": 687 + }, + { + "epoch": 2.4839729119638827, + "grad_norm": 206.49517822265625, + "learning_rate": 2.630852994555354e-05, + "loss": 37.6828, + "step": 688 + }, + { + "epoch": 2.4875846501128667, + "grad_norm": 230.03382873535156, + "learning_rate": 2.6303085299455537e-05, + "loss": 39.2154, + "step": 689 + }, + { + "epoch": 2.491196388261851, + "grad_norm": 224.42454528808594, + "learning_rate": 2.6297640653357532e-05, + "loss": 37.5136, + "step": 690 + }, + { + "epoch": 2.491196388261851, + "eval_loss": 0.657163679599762, + "eval_runtime": 3.1455, + "eval_samples_per_second": 56.906, + "eval_steps_per_second": 56.906, + "step": 690 + }, + { + "epoch": 2.494808126410835, + "grad_norm": 186.3481903076172, + "learning_rate": 2.6292196007259528e-05, + "loss": 37.1198, + "step": 691 + }, + { + "epoch": 2.4984198645598195, + "grad_norm": 231.81553649902344, + "learning_rate": 2.6286751361161526e-05, + "loss": 38.3062, + "step": 692 + }, + { + "epoch": 2.5020316027088034, + "grad_norm": 221.0079803466797, + "learning_rate": 2.6281306715063522e-05, + "loss": 39.543, + "step": 693 + }, + { + "epoch": 2.505643340857788, + "grad_norm": 251.6171112060547, + "learning_rate": 2.6275862068965517e-05, + "loss": 38.5384, + "step": 694 + }, + { + "epoch": 2.5092550790067722, + "grad_norm": 239.07843017578125, + "learning_rate": 2.6270417422867512e-05, + "loss": 39.2217, + "step": 695 + }, + { + "epoch": 2.512866817155756, + "grad_norm": 256.3560485839844, + "learning_rate": 2.6264972776769508e-05, + "loss": 39.2529, + "step": 696 + }, + { + "epoch": 2.51647855530474, + "grad_norm": 245.74522399902344, + "learning_rate": 2.625952813067151e-05, + "loss": 39.6369, + "step": 697 + }, + { + "epoch": 2.5200902934537246, + "grad_norm": 279.8902893066406, + "learning_rate": 2.6254083484573505e-05, + "loss": 40.1488, + "step": 698 + }, + { + "epoch": 2.523702031602709, + "grad_norm": 267.12811279296875, + "learning_rate": 2.62486388384755e-05, + "loss": 40.6809, + "step": 699 + }, + { + "epoch": 2.527313769751693, + "grad_norm": 291.1154479980469, + "learning_rate": 2.6243194192377496e-05, + "loss": 35.9417, + "step": 700 + }, + { + "epoch": 2.527313769751693, + "eval_loss": 0.677870512008667, + "eval_runtime": 3.1315, + "eval_samples_per_second": 57.16, + "eval_steps_per_second": 57.16, + "step": 700 + }, + { + "epoch": 2.530925507900677, + "grad_norm": 362.4072570800781, + "learning_rate": 2.623774954627949e-05, + "loss": 26.5904, + "step": 701 + }, + { + "epoch": 2.5345372460496614, + "grad_norm": 346.172607421875, + "learning_rate": 2.6232304900181487e-05, + "loss": 25.4147, + "step": 702 + }, + { + "epoch": 2.5381489841986458, + "grad_norm": 193.29322814941406, + "learning_rate": 2.6226860254083486e-05, + "loss": 26.0221, + "step": 703 + }, + { + "epoch": 2.5417607223476297, + "grad_norm": 195.480224609375, + "learning_rate": 2.622141560798548e-05, + "loss": 26.1951, + "step": 704 + }, + { + "epoch": 2.545372460496614, + "grad_norm": 459.3381652832031, + "learning_rate": 2.6215970961887476e-05, + "loss": 49.9201, + "step": 705 + }, + { + "epoch": 2.548984198645598, + "grad_norm": 430.83160400390625, + "learning_rate": 2.6210526315789475e-05, + "loss": 51.813, + "step": 706 + }, + { + "epoch": 2.5525959367945825, + "grad_norm": 395.5831604003906, + "learning_rate": 2.620508166969147e-05, + "loss": 49.6055, + "step": 707 + }, + { + "epoch": 2.5562076749435665, + "grad_norm": 349.0957946777344, + "learning_rate": 2.619963702359347e-05, + "loss": 47.4299, + "step": 708 + }, + { + "epoch": 2.559819413092551, + "grad_norm": 317.4203796386719, + "learning_rate": 2.6194192377495465e-05, + "loss": 47.5578, + "step": 709 + }, + { + "epoch": 2.563431151241535, + "grad_norm": 284.44659423828125, + "learning_rate": 2.618874773139746e-05, + "loss": 47.19, + "step": 710 + }, + { + "epoch": 2.563431151241535, + "eval_loss": 0.6700878739356995, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.172, + "eval_steps_per_second": 57.172, + "step": 710 + }, + { + "epoch": 2.5670428893905193, + "grad_norm": 313.7208251953125, + "learning_rate": 2.6183303085299456e-05, + "loss": 47.0923, + "step": 711 + }, + { + "epoch": 2.5706546275395032, + "grad_norm": 284.9776611328125, + "learning_rate": 2.617785843920145e-05, + "loss": 47.024, + "step": 712 + }, + { + "epoch": 2.5742663656884877, + "grad_norm": 264.72515869140625, + "learning_rate": 2.6172413793103446e-05, + "loss": 47.0988, + "step": 713 + }, + { + "epoch": 2.5778781038374716, + "grad_norm": 244.52915954589844, + "learning_rate": 2.6166969147005445e-05, + "loss": 46.1584, + "step": 714 + }, + { + "epoch": 2.581489841986456, + "grad_norm": 255.7130889892578, + "learning_rate": 2.6161524500907444e-05, + "loss": 44.7084, + "step": 715 + }, + { + "epoch": 2.58510158013544, + "grad_norm": 276.594482421875, + "learning_rate": 2.615607985480944e-05, + "loss": 44.2491, + "step": 716 + }, + { + "epoch": 2.5887133182844244, + "grad_norm": 274.7431335449219, + "learning_rate": 2.6150635208711435e-05, + "loss": 42.3555, + "step": 717 + }, + { + "epoch": 2.592325056433409, + "grad_norm": 276.954833984375, + "learning_rate": 2.614519056261343e-05, + "loss": 43.7643, + "step": 718 + }, + { + "epoch": 2.595936794582393, + "grad_norm": 194.3367156982422, + "learning_rate": 2.613974591651543e-05, + "loss": 42.2725, + "step": 719 + }, + { + "epoch": 2.5995485327313768, + "grad_norm": 176.41236877441406, + "learning_rate": 2.6134301270417424e-05, + "loss": 42.5421, + "step": 720 + }, + { + "epoch": 2.5995485327313768, + "eval_loss": 0.6591465473175049, + "eval_runtime": 3.1343, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 720 + }, + { + "epoch": 2.603160270880361, + "grad_norm": 205.8289031982422, + "learning_rate": 2.612885662431942e-05, + "loss": 41.3438, + "step": 721 + }, + { + "epoch": 2.6067720090293456, + "grad_norm": 204.97471618652344, + "learning_rate": 2.6123411978221415e-05, + "loss": 44.5023, + "step": 722 + }, + { + "epoch": 2.6103837471783295, + "grad_norm": 230.9344482421875, + "learning_rate": 2.611796733212341e-05, + "loss": 42.8994, + "step": 723 + }, + { + "epoch": 2.6139954853273135, + "grad_norm": 186.5467987060547, + "learning_rate": 2.6112522686025406e-05, + "loss": 43.5145, + "step": 724 + }, + { + "epoch": 2.617607223476298, + "grad_norm": 212.39852905273438, + "learning_rate": 2.6107078039927408e-05, + "loss": 44.4214, + "step": 725 + }, + { + "epoch": 2.6212189616252823, + "grad_norm": 214.2425994873047, + "learning_rate": 2.6101633393829403e-05, + "loss": 44.4302, + "step": 726 + }, + { + "epoch": 2.6248306997742663, + "grad_norm": 203.32107543945312, + "learning_rate": 2.60961887477314e-05, + "loss": 44.4621, + "step": 727 + }, + { + "epoch": 2.6284424379232507, + "grad_norm": 227.49472045898438, + "learning_rate": 2.6090744101633394e-05, + "loss": 44.0782, + "step": 728 + }, + { + "epoch": 2.6320541760722347, + "grad_norm": 242.5762939453125, + "learning_rate": 2.608529945553539e-05, + "loss": 44.7653, + "step": 729 + }, + { + "epoch": 2.635665914221219, + "grad_norm": 223.6633758544922, + "learning_rate": 2.6079854809437388e-05, + "loss": 43.9382, + "step": 730 + }, + { + "epoch": 2.635665914221219, + "eval_loss": 0.6548755764961243, + "eval_runtime": 3.1317, + "eval_samples_per_second": 57.158, + "eval_steps_per_second": 57.158, + "step": 730 + }, + { + "epoch": 2.639277652370203, + "grad_norm": 237.716552734375, + "learning_rate": 2.6074410163339384e-05, + "loss": 45.1399, + "step": 731 + }, + { + "epoch": 2.6428893905191875, + "grad_norm": 214.22898864746094, + "learning_rate": 2.606896551724138e-05, + "loss": 40.4102, + "step": 732 + }, + { + "epoch": 2.6465011286681714, + "grad_norm": 312.23956298828125, + "learning_rate": 2.6063520871143374e-05, + "loss": 39.1898, + "step": 733 + }, + { + "epoch": 2.650112866817156, + "grad_norm": 199.07408142089844, + "learning_rate": 2.6058076225045373e-05, + "loss": 36.5315, + "step": 734 + }, + { + "epoch": 2.65372460496614, + "grad_norm": 229.65692138671875, + "learning_rate": 2.605263157894737e-05, + "loss": 36.8341, + "step": 735 + }, + { + "epoch": 2.6573363431151242, + "grad_norm": 222.81546020507812, + "learning_rate": 2.6047186932849367e-05, + "loss": 36.1602, + "step": 736 + }, + { + "epoch": 2.660948081264108, + "grad_norm": 253.58770751953125, + "learning_rate": 2.6041742286751363e-05, + "loss": 36.7221, + "step": 737 + }, + { + "epoch": 2.6645598194130926, + "grad_norm": 291.77325439453125, + "learning_rate": 2.6036297640653358e-05, + "loss": 37.0262, + "step": 738 + }, + { + "epoch": 2.6681715575620766, + "grad_norm": 293.3721618652344, + "learning_rate": 2.6030852994555353e-05, + "loss": 38.2955, + "step": 739 + }, + { + "epoch": 2.671783295711061, + "grad_norm": 210.18955993652344, + "learning_rate": 2.602540834845735e-05, + "loss": 38.9277, + "step": 740 + }, + { + "epoch": 2.671783295711061, + "eval_loss": 0.6631377339363098, + "eval_runtime": 3.1357, + "eval_samples_per_second": 57.084, + "eval_steps_per_second": 57.084, + "step": 740 + }, + { + "epoch": 2.6753950338600454, + "grad_norm": 224.5206298828125, + "learning_rate": 2.6019963702359348e-05, + "loss": 39.1264, + "step": 741 + }, + { + "epoch": 2.6790067720090294, + "grad_norm": 307.2724914550781, + "learning_rate": 2.6014519056261343e-05, + "loss": 40.2912, + "step": 742 + }, + { + "epoch": 2.6826185101580133, + "grad_norm": 287.6835021972656, + "learning_rate": 2.600907441016334e-05, + "loss": 39.1569, + "step": 743 + }, + { + "epoch": 2.6862302483069977, + "grad_norm": 286.31817626953125, + "learning_rate": 2.6003629764065337e-05, + "loss": 38.4985, + "step": 744 + }, + { + "epoch": 2.689841986455982, + "grad_norm": 269.58740234375, + "learning_rate": 2.5998185117967332e-05, + "loss": 40.7763, + "step": 745 + }, + { + "epoch": 2.693453724604966, + "grad_norm": 222.31248474121094, + "learning_rate": 2.599274047186933e-05, + "loss": 39.9336, + "step": 746 + }, + { + "epoch": 2.69706546275395, + "grad_norm": 214.96624755859375, + "learning_rate": 2.5987295825771327e-05, + "loss": 39.4074, + "step": 747 + }, + { + "epoch": 2.7006772009029345, + "grad_norm": 296.5968322753906, + "learning_rate": 2.5981851179673322e-05, + "loss": 40.9984, + "step": 748 + }, + { + "epoch": 2.704288939051919, + "grad_norm": 228.1329803466797, + "learning_rate": 2.5976406533575317e-05, + "loss": 38.6395, + "step": 749 + }, + { + "epoch": 2.707900677200903, + "grad_norm": 254.83538818359375, + "learning_rate": 2.5970961887477313e-05, + "loss": 28.1237, + "step": 750 + }, + { + "epoch": 2.707900677200903, + "eval_loss": 0.6833599209785461, + "eval_runtime": 3.1401, + "eval_samples_per_second": 57.005, + "eval_steps_per_second": 57.005, + "step": 750 + }, + { + "epoch": 2.7115124153498873, + "grad_norm": 196.43338012695312, + "learning_rate": 2.5965517241379308e-05, + "loss": 25.9143, + "step": 751 + }, + { + "epoch": 2.7151241534988713, + "grad_norm": 223.3903350830078, + "learning_rate": 2.596007259528131e-05, + "loss": 25.2986, + "step": 752 + }, + { + "epoch": 2.7187358916478557, + "grad_norm": 220.7471923828125, + "learning_rate": 2.5954627949183306e-05, + "loss": 25.9919, + "step": 753 + }, + { + "epoch": 2.7223476297968396, + "grad_norm": 204.15382385253906, + "learning_rate": 2.59491833030853e-05, + "loss": 26.2117, + "step": 754 + }, + { + "epoch": 2.725959367945824, + "grad_norm": 536.2657470703125, + "learning_rate": 2.5943738656987296e-05, + "loss": 52.0849, + "step": 755 + }, + { + "epoch": 2.729571106094808, + "grad_norm": 623.6157836914062, + "learning_rate": 2.5938294010889292e-05, + "loss": 51.8393, + "step": 756 + }, + { + "epoch": 2.7331828442437924, + "grad_norm": 491.22821044921875, + "learning_rate": 2.593284936479129e-05, + "loss": 51.4791, + "step": 757 + }, + { + "epoch": 2.7367945823927764, + "grad_norm": 414.8413391113281, + "learning_rate": 2.5927404718693286e-05, + "loss": 49.5306, + "step": 758 + }, + { + "epoch": 2.740406320541761, + "grad_norm": 363.01715087890625, + "learning_rate": 2.592196007259528e-05, + "loss": 47.1806, + "step": 759 + }, + { + "epoch": 2.7440180586907448, + "grad_norm": 309.8416442871094, + "learning_rate": 2.5916515426497277e-05, + "loss": 47.7516, + "step": 760 + }, + { + "epoch": 2.7440180586907448, + "eval_loss": 0.6723723411560059, + "eval_runtime": 3.1345, + "eval_samples_per_second": 57.106, + "eval_steps_per_second": 57.106, + "step": 760 + }, + { + "epoch": 2.747629796839729, + "grad_norm": 297.294189453125, + "learning_rate": 2.5911070780399276e-05, + "loss": 46.8819, + "step": 761 + }, + { + "epoch": 2.751241534988713, + "grad_norm": 271.9000549316406, + "learning_rate": 2.590562613430127e-05, + "loss": 46.6719, + "step": 762 + }, + { + "epoch": 2.7548532731376976, + "grad_norm": 223.2354278564453, + "learning_rate": 2.590018148820327e-05, + "loss": 45.829, + "step": 763 + }, + { + "epoch": 2.758465011286682, + "grad_norm": 267.2200012207031, + "learning_rate": 2.5894736842105265e-05, + "loss": 46.8854, + "step": 764 + }, + { + "epoch": 2.762076749435666, + "grad_norm": 240.17990112304688, + "learning_rate": 2.588929219600726e-05, + "loss": 44.7511, + "step": 765 + }, + { + "epoch": 2.76568848758465, + "grad_norm": 319.76959228515625, + "learning_rate": 2.5883847549909256e-05, + "loss": 42.0385, + "step": 766 + }, + { + "epoch": 2.7693002257336343, + "grad_norm": 221.0363006591797, + "learning_rate": 2.587840290381125e-05, + "loss": 43.6279, + "step": 767 + }, + { + "epoch": 2.7729119638826187, + "grad_norm": 211.2090606689453, + "learning_rate": 2.587295825771325e-05, + "loss": 42.0023, + "step": 768 + }, + { + "epoch": 2.7765237020316027, + "grad_norm": 214.4199981689453, + "learning_rate": 2.5867513611615245e-05, + "loss": 41.4171, + "step": 769 + }, + { + "epoch": 2.7801354401805867, + "grad_norm": 248.0699462890625, + "learning_rate": 2.586206896551724e-05, + "loss": 42.2437, + "step": 770 + }, + { + "epoch": 2.7801354401805867, + "eval_loss": 0.6616525053977966, + "eval_runtime": 3.1342, + "eval_samples_per_second": 57.111, + "eval_steps_per_second": 57.111, + "step": 770 + }, + { + "epoch": 2.783747178329571, + "grad_norm": 204.29493713378906, + "learning_rate": 2.585662431941924e-05, + "loss": 42.0058, + "step": 771 + }, + { + "epoch": 2.7873589164785555, + "grad_norm": 223.14077758789062, + "learning_rate": 2.5851179673321235e-05, + "loss": 43.7219, + "step": 772 + }, + { + "epoch": 2.7909706546275395, + "grad_norm": 219.99261474609375, + "learning_rate": 2.584573502722323e-05, + "loss": 44.1564, + "step": 773 + }, + { + "epoch": 2.7945823927765234, + "grad_norm": 194.47219848632812, + "learning_rate": 2.584029038112523e-05, + "loss": 43.631, + "step": 774 + }, + { + "epoch": 2.798194130925508, + "grad_norm": 191.4344940185547, + "learning_rate": 2.5834845735027224e-05, + "loss": 43.4141, + "step": 775 + }, + { + "epoch": 2.8018058690744923, + "grad_norm": 218.28073120117188, + "learning_rate": 2.582940108892922e-05, + "loss": 43.936, + "step": 776 + }, + { + "epoch": 2.805417607223476, + "grad_norm": 186.77444458007812, + "learning_rate": 2.5823956442831215e-05, + "loss": 44.7909, + "step": 777 + }, + { + "epoch": 2.8090293453724606, + "grad_norm": 205.01918029785156, + "learning_rate": 2.581851179673321e-05, + "loss": 43.726, + "step": 778 + }, + { + "epoch": 2.8126410835214446, + "grad_norm": 200.90245056152344, + "learning_rate": 2.581306715063521e-05, + "loss": 43.7542, + "step": 779 + }, + { + "epoch": 2.816252821670429, + "grad_norm": 200.3115692138672, + "learning_rate": 2.5807622504537208e-05, + "loss": 44.6297, + "step": 780 + }, + { + "epoch": 2.816252821670429, + "eval_loss": 0.65194171667099, + "eval_runtime": 3.1309, + "eval_samples_per_second": 57.173, + "eval_steps_per_second": 57.173, + "step": 780 + }, + { + "epoch": 2.819864559819413, + "grad_norm": 278.512939453125, + "learning_rate": 2.5802177858439204e-05, + "loss": 44.8844, + "step": 781 + }, + { + "epoch": 2.8234762979683974, + "grad_norm": 231.06387329101562, + "learning_rate": 2.57967332123412e-05, + "loss": 44.1802, + "step": 782 + }, + { + "epoch": 2.8270880361173814, + "grad_norm": 232.7779083251953, + "learning_rate": 2.5791288566243194e-05, + "loss": 41.4748, + "step": 783 + }, + { + "epoch": 2.8306997742663658, + "grad_norm": 229.13340759277344, + "learning_rate": 2.578584392014519e-05, + "loss": 38.8538, + "step": 784 + }, + { + "epoch": 2.8343115124153497, + "grad_norm": 290.4147644042969, + "learning_rate": 2.578039927404719e-05, + "loss": 38.5641, + "step": 785 + }, + { + "epoch": 2.837923250564334, + "grad_norm": 285.3528137207031, + "learning_rate": 2.5774954627949184e-05, + "loss": 36.2725, + "step": 786 + }, + { + "epoch": 2.8415349887133186, + "grad_norm": 218.9436492919922, + "learning_rate": 2.576950998185118e-05, + "loss": 36.5417, + "step": 787 + }, + { + "epoch": 2.8451467268623025, + "grad_norm": 264.1986083984375, + "learning_rate": 2.5764065335753175e-05, + "loss": 37.4064, + "step": 788 + }, + { + "epoch": 2.8487584650112865, + "grad_norm": 182.3573760986328, + "learning_rate": 2.5758620689655173e-05, + "loss": 38.2529, + "step": 789 + }, + { + "epoch": 2.852370203160271, + "grad_norm": 213.42701721191406, + "learning_rate": 2.5753176043557172e-05, + "loss": 38.1339, + "step": 790 + }, + { + "epoch": 2.852370203160271, + "eval_loss": 0.6563644409179688, + "eval_runtime": 3.1295, + "eval_samples_per_second": 57.198, + "eval_steps_per_second": 57.198, + "step": 790 + }, + { + "epoch": 2.8559819413092553, + "grad_norm": 277.4792175292969, + "learning_rate": 2.5747731397459168e-05, + "loss": 37.8052, + "step": 791 + }, + { + "epoch": 2.8595936794582393, + "grad_norm": 299.55462646484375, + "learning_rate": 2.5742286751361163e-05, + "loss": 38.4587, + "step": 792 + }, + { + "epoch": 2.8632054176072232, + "grad_norm": 253.10867309570312, + "learning_rate": 2.5736842105263158e-05, + "loss": 39.4709, + "step": 793 + }, + { + "epoch": 2.8668171557562077, + "grad_norm": 228.04470825195312, + "learning_rate": 2.5731397459165154e-05, + "loss": 39.0288, + "step": 794 + }, + { + "epoch": 2.870428893905192, + "grad_norm": 211.8145751953125, + "learning_rate": 2.572595281306715e-05, + "loss": 39.8209, + "step": 795 + }, + { + "epoch": 2.874040632054176, + "grad_norm": 201.8890838623047, + "learning_rate": 2.5720508166969148e-05, + "loss": 40.0695, + "step": 796 + }, + { + "epoch": 2.87765237020316, + "grad_norm": 243.47447204589844, + "learning_rate": 2.5715063520871143e-05, + "loss": 40.036, + "step": 797 + }, + { + "epoch": 2.8812641083521444, + "grad_norm": 289.123291015625, + "learning_rate": 2.5709618874773142e-05, + "loss": 40.683, + "step": 798 + }, + { + "epoch": 2.884875846501129, + "grad_norm": 292.1619567871094, + "learning_rate": 2.5704174228675137e-05, + "loss": 41.132, + "step": 799 + }, + { + "epoch": 2.888487584650113, + "grad_norm": 279.39947509765625, + "learning_rate": 2.5698729582577133e-05, + "loss": 32.0392, + "step": 800 + }, + { + "epoch": 2.888487584650113, + "eval_loss": 0.6752151250839233, + "eval_runtime": 3.1352, + "eval_samples_per_second": 57.094, + "eval_steps_per_second": 57.094, + "step": 800 + } + ], + "logging_steps": 1, + "max_steps": 5520, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.926172950573875e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/training_args.bin b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..e6c3e690171f819d131f1e6f539e9149fa903b8f --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e022e3d974aac327c05aae83b4d8cb01bf304b0b5adee68431d6222a9c2d2c5 +size 5944 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/normalizer.pt b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/normalizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c710318a53f358df84325d7ad4e3214bc02b0f95 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/normalizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7863dd8b1cec54ea35c3840df161a34ee700911175e25f412f15c526ba1f4db9 +size 940 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/train_llm_workspace.log b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/train_llm_workspace.log new file mode 100644 index 0000000000000000000000000000000000000000..f681ec3d1f8dc1a64d9f07ab0e7018c2ec2ec1b7 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/train_llm_workspace.log @@ -0,0 +1,5 @@ +[2025-03-10 09:20:33,509][datasets.arrow_dataset][WARNING] - Setting TOKENIZERS_PARALLELISM=false for forked processes. +[2025-03-10 09:20:37,556][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -I/home/chyang/miniconda3/envs/llm-bc/include -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -fPIC -c /tmp/tmp279_x4mk/test.c -o /tmp/tmp279_x4mk/test.o +[2025-03-10 09:20:37,599][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat /tmp/tmp279_x4mk/test.o -laio -o /tmp/tmp279_x4mk/a.out +[2025-03-10 09:20:37,845][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -I/home/chyang/miniconda3/envs/llm-bc/include -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -fPIC -c /tmp/tmpsqvpk3lb/test.c -o /tmp/tmpsqvpk3lb/test.o +[2025-03-10 09:20:37,894][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat /tmp/tmpsqvpk3lb/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpsqvpk3lb/a.out diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/debug-internal.log b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d47d248bd7e877599a5823e406f6c4b1db663b90 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/debug-internal.log @@ -0,0 +1,17 @@ +{"time":"2025-03-10T09:20:27.395245778+08:00","level":"INFO","msg":"using version","core version":"0.18.6"} +{"time":"2025-03-10T09:20:27.395272821+08:00","level":"INFO","msg":"created symlink","path":"/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-core.log"} +{"time":"2025-03-10T09:20:27.501784545+08:00","level":"INFO","msg":"created new stream","id":"6awu8klx"} +{"time":"2025-03-10T09:20:27.501800611+08:00","level":"INFO","msg":"stream: started","id":"6awu8klx"} +{"time":"2025-03-10T09:20:27.501864561+08:00","level":"INFO","msg":"sender: started","stream_id":"6awu8klx"} +{"time":"2025-03-10T09:20:27.501854739+08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T09:20:27.502027573+08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T09:20:28.05035496+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-10T14:12:46.314827029+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-03-10T14:12:46.315636093+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-03-10T14:12:47.315465643+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.404143887},{"desc":"uploading output.log","runtime_seconds":0.161471187,"progress":"990.7KB/990.7KB"},{"desc":"uploading config.yaml","runtime_seconds":0.161454803,"progress":"15.6KB/15.6KB"}],"total_operations":3}} +{"time":"2025-03-10T14:12:48.510476348+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-03-10T14:12:49.833150896+08:00","level":"INFO","msg":"stream: closing","id":"6awu8klx"} +{"time":"2025-03-10T14:12:49.833196761+08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T14:12:49.833235054+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T14:12:49.833251953+08:00","level":"INFO","msg":"sender: closed","stream_id":"6awu8klx"} +{"time":"2025-03-10T14:12:49.833413851+08:00","level":"INFO","msg":"stream: closed","id":"6awu8klx"} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/debug.log b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2e9a5446b7901eb8c9425ec184d05040be64cf54 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/debug.log @@ -0,0 +1,35 @@ +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Current SDK version is 0.18.6 +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Configure stats pid to 3973458 +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Loading settings from /home/chyang/.config/wandb/settings +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Loading settings from /home/chyang/workspace/LLM-BC/wandb/settings +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'llmbc/workspace/train_llm_workspace.py', 'program_abspath': '/home/chyang/workspace/LLM-BC/llmbc/workspace/train_llm_workspace.py', 'program': '/home/chyang/workspace/LLM-BC/./llmbc/workspace/train_llm_workspace.py'} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_init.py:_log_setup():533] Logging user logs to /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug.log +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_init.py:_log_setup():534] Logging internal logs to /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-internal.log +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_init.py:init():619] calling init triggers +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {'name': 'train_llm_lowdim', '_target_': 'llmbc.workspace.train_llm_workspace.TrainLLMWorkspace', 'obs_dim': 9, 'action_dim': 4, 'horizon': 1, 'n_obs_steps': 1, 'n_action_steps': 1, 'task_name': 'push-v2', 'exp_name': 'train llm', 'model_name': 'meta-llama/Llama-3.2-1B-Instruct', 'use_quantization': False, 'lora_config': {'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'task_type': 'CAUSAL_LM'}, 'dataset': {'test_data_ratio': 0.01}, 'debug': False, 'training': {'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 64, 'optim': 'paged_adamw_32bit', 'num_train_epochs': 20, 'eval_strategy': 'steps', 'logging_steps': 1, 'warmup_steps': 10, 'logging_strategy': 'steps', 'learning_rate': 3e-05, 'fp16': False, 'bf16': True, 'tf32': True, 'group_by_length': True, 'report_to': 'wandb', 'save_steps': 200, 'eval_steps': 10, 'use_joint_mlp_projector': True, 'joint_obs_action_mlp_lr': 1e-06}, 'trainer': {'obs_dim': 9, 'action_dim': 4, 'use_joint_mlp_projector': True, 'max_seq_length': 100, 'dataset_text_field': 'text', 'packing': False}, 'logging': {'project': 'llm_module_finetuning', 'resume': True, 'mode': 'online', 'name': '2025.03.10-09.20.26_train_llm_lowdim_push-v2', 'tags': ['train_llm_lowdim', 'push-v2', 'train llm'], 'id': None, 'group': None}, 'multi_run': {'run_dir': 'data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2', 'wandb_name_base': '2025.03.10-09.20.26_train_llm_lowdim_push-v2'}, 'task': {'name': 'push-v2', 'obs_dim': 9, 'action_dim': 4, 'env_runner': {'_target_': 'llmbc.env_runner.metaworld_lowdim_runner.MetaworldLowdimRunner', 'env_name': 'llf-metaworld-push-v2', 'max_steps': 30, 'n_obs_steps': 1, 'n_action_steps': 1, 'instruction_type': 'b', 'feedback_type': ['hp', 'hn', 'fp'], 'visual': False}, 'dataset': {'_target_': 'llmbc.dataset.metaworld_lowdim_dataset.MetaworldLowdimDataset', 'data_path': 'datasets/push-v2-general.pt', 'data_path2': 'datasets/push-v2.pt', 'horizon': 1, 'pad_before': 0, 'pad_after': 0, 'obs_eef_target': True, 'use_manual_normalizer': False, 'val_ratio': 0.2, 'dummy_normalizer': True}, 'instructor': {'_target_': 'llmbc.translator.instructor.metaworld_instructor.push_v2_instructor.PushV2Instructor'}}, 'llm': {'name': 'meta-llama/Llama-3.2-1B-Instruct', 'model_name': 'Llama-3.2-1B-Instruct', 'use_quantization': False, 'load_from_checkpoint': False, 'adaptor_path': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.05/13.39.46_train_llm_lowdim_sweep-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-sweep-v2/checkpoint-3550', 'use_orig_model': False, 'use_joint_mlp_projector': True, 'load_from_mlp_projector_checkpoint': True, 'mlp_projector_checkpoint_path': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.09/12.41.09_train_mlp_projector_metaworld/checkpoints/latest.ckpt', 'max_length': 100, 'config_target': 'llmbc.model.llm.llama_lowdim_model.LowdimLlamaConfig', 'causal_lm_target': 'llmbc.model.llm.llama_lowdim_model.LowdimLlamaForCausalLM', 'lora_config': {'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'task_type': 'CAUSAL_LM'}, 'prompter': {'_target_': 'llmbc.translator.prompter.llama_prompter.LlamaPrompter', 'use_joint_mlp_projector': True}, 'hydra': {'job': {'override_dirname': 'meta-llama/Llama-3.2-1B-Instruct'}, 'run': {'dir': 'data/outputs/2025.03.10/09.20.26_meta-llama/Llama-3.2-1B-Instruct'}}}} +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():669] starting backend +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():673] sending inform_init request +2025-03-10 09:20:27,391 INFO MainThread:3973458 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():686] backend started and connected +2025-03-10 09:20:27,395 INFO MainThread:3973458 [wandb_init.py:init():781] updated telemetry +2025-03-10 09:20:27,414 INFO MainThread:3973458 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-03-10 09:20:28,047 INFO MainThread:3973458 [wandb_init.py:init():867] starting run threads in backend +2025-03-10 09:20:28,132 INFO MainThread:3973458 [wandb_run.py:_console_start():2451] atexit reg +2025-03-10 09:20:28,132 INFO MainThread:3973458 [wandb_run.py:_redirect():2299] redirect: wrap_raw +2025-03-10 09:20:28,133 INFO MainThread:3973458 [wandb_run.py:_redirect():2364] Wrapping output streams. +2025-03-10 09:20:28,133 INFO MainThread:3973458 [wandb_run.py:_redirect():2389] Redirects installed. +2025-03-10 09:20:28,136 INFO MainThread:3973458 [wandb_init.py:init():911] run started, returning control to user process +2025-03-10 09:20:40,580 INFO MainThread:3973458 [wandb_run.py:_config_callback():1389] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'meta-llama/Llama-3.2-1B-Instruct', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': {'v_proj', 'down_proj', 'gate_proj', 'q_proj', 'o_proj', 'k_proj', 'up_proj'}, 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'obs_dim': 9, 'action_dim': 4, 'use_joint_mlp_projector': True, 'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 2048, 'intermediate_size': 8192, 'num_hidden_layers': 16, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': [128001, 128008, 128009], 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-1B-Instruct', '_attn_implementation_autoset': True, 'transformers_version': '4.47.1', 'model_type': 'llama_lowdim', 'output_dir': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 64, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 20, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 10, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/runs/Mar10_09-20-37_user', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_32bit', 'optim_args': None, 'adafactor': False, 'group_by_length': True, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'dataset_text_field': 'text', 'packing': False, 'max_seq_length': 100, 'dataset_num_proc': None, 'dataset_batch_size': 1000, 'model_init_kwargs': None, 'dataset_kwargs': {}, 'eval_packing': None, 'num_of_sequences': 1024, 'chars_per_token': '', 'use_liger': False, 'joint_obs_action_mlp_lr': 1e-06, 'obs_mlp_lr': None, 'action_mlp_lr': None} +2025-03-10 09:20:40,583 INFO MainThread:3973458 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1251311616 - > +2025-03-10 09:20:40,583 INFO MainThread:3973458 [wandb_run.py:_config_callback():1389] config_cb model/num_parameters 1251311616 None +2025-03-10 14:12:46,313 INFO MainThread:3973458 [wandb_run.py:_finish():2146] finishing run chyang25-national-taiwan-university/llm_module_finetuning/6awu8klx +2025-03-10 14:12:46,314 INFO MainThread:3973458 [wandb_run.py:_atexit_cleanup():2414] got exitcode: 0 +2025-03-10 14:12:46,314 INFO MainThread:3973458 [wandb_run.py:_restore():2396] restore +2025-03-10 14:12:46,314 INFO MainThread:3973458 [wandb_run.py:_restore():2402] restore done +2025-03-10 14:12:49,828 INFO MainThread:3973458 [wandb_run.py:_footer_history_summary_info():3963] rendering history +2025-03-10 14:12:49,829 INFO MainThread:3973458 [wandb_run.py:_footer_history_summary_info():3995] rendering summary +2025-03-10 14:12:49,832 INFO MainThread:3973458 [wandb_run.py:_footer_sync_info():3922] logging synced files diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/config.yaml b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6cee3f9ae71deee220a2fab65ce515267e46b831 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/config.yaml @@ -0,0 +1,744 @@ +_attn_implementation_autoset: + value: true +_name_or_path: + value: meta-llama/Llama-3.2-1B-Instruct +_target_: + value: llmbc.workspace.train_llm_workspace.TrainLLMWorkspace +_wandb: + value: + cli_version: 0.18.6 + m: + - "1": train/grad_norm + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/global_step + "6": + - 3 + "7": [] + - "1": eval/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/samples_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/steps_per_second + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/learning_rate + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/epoch + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": train/loss + "5": 2 + "6": + - 1 + - 3 + "7": [] + - "1": eval/runtime + "5": 2 + "6": + - 1 + - 3 + "7": [] + python_version: 3.9.21 + t: + "1": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 41 + - 49 + - 50 + - 51 + - 53 + - 55 + - 71 + - 84 + - 98 + "2": + - 1 + - 2 + - 3 + - 5 + - 11 + - 12 + - 41 + - 49 + - 50 + - 51 + - 53 + - 55 + - 71 + - 84 + - 98 + "3": + - 2 + - 7 + - 13 + - 15 + - 16 + - 19 + - 23 + - 55 + - 62 + - 66 + "4": 3.9.21 + "5": 0.18.6 + "6": 4.47.1 + "8": + - 5 + "9": + "1": transformers_trainer + "12": 0.18.6 + "13": linux-x86_64 +accelerator_config: + value: + dispatch_batches: null + even_batches: true + gradient_accumulation_kwargs: null + non_blocking: false + split_batches: false + use_seedable_sampler: true +action_dim: + value: 4 +action_mlp_lr: + value: null +adafactor: + value: false +adam_beta1: + value: 0.9 +adam_beta2: + value: 0.999 +adam_epsilon: + value: 1e-08 +add_cross_attention: + value: false +architectures: + value: + - LlamaForCausalLM +attention_bias: + value: false +attention_dropout: + value: 0 +auto_find_batch_size: + value: false +average_tokens_across_devices: + value: false +bad_words_ids: + value: null +batch_eval_metrics: + value: false +begin_suppress_tokens: + value: null +bf16: + value: true +bf16_full_eval: + value: false +bos_token_id: + value: 128000 +chars_per_token: + value: +chunk_size_feed_forward: + value: 0 +cross_attention_hidden_size: + value: null +data_seed: + value: null +dataloader_drop_last: + value: false +dataloader_num_workers: + value: 0 +dataloader_persistent_workers: + value: false +dataloader_pin_memory: + value: true +dataloader_prefetch_factor: + value: null +dataset: + value: + test_data_ratio: 0.01 +dataset_batch_size: + value: 1000 +dataset_num_proc: + value: null +dataset_text_field: + value: text +ddp_backend: + value: null +ddp_broadcast_buffers: + value: null +ddp_bucket_cap_mb: + value: null +ddp_find_unused_parameters: + value: null +ddp_timeout: + value: 1800 +debug: + value: [] +decoder_start_token_id: + value: null +deepspeed: + value: null +disable_tqdm: + value: false +dispatch_batches: + value: null +diversity_penalty: + value: 0 +do_eval: + value: true +do_predict: + value: false +do_sample: + value: false +do_train: + value: false +early_stopping: + value: false +encoder_no_repeat_ngram_size: + value: 0 +eos_token_id: + value: + - 128001 + - 128008 + - 128009 +eval_accumulation_steps: + value: null +eval_delay: + value: 0 +eval_do_concat_batches: + value: true +eval_on_start: + value: false +eval_packing: + value: null +eval_steps: + value: 10 +eval_strategy: + value: steps +eval_use_gather_object: + value: false +evaluation_strategy: + value: null +exp_name: + value: train llm +exponential_decay_length_penalty: + value: null +finetuning_task: + value: null +forced_bos_token_id: + value: null +forced_eos_token_id: + value: null +fp16: + value: false +fp16_backend: + value: auto +fp16_full_eval: + value: false +fp16_opt_level: + value: O1 +fsdp: + value: [] +fsdp_config: + value: + min_num_params: 0 + xla: false + xla_fsdp_grad_ckpt: false + xla_fsdp_v2: false +fsdp_min_num_params: + value: 0 +fsdp_transformer_layer_cls_to_wrap: + value: null +full_determinism: + value: false +gradient_accumulation_steps: + value: 64 +gradient_checkpointing: + value: false +gradient_checkpointing_kwargs: + value: null +greater_is_better: + value: null +group_by_length: + value: true +half_precision_backend: + value: auto +head_dim: + value: 64 +hidden_act: + value: silu +hidden_size: + value: 2048 +horizon: + value: 1 +hub_always_push: + value: false +hub_model_id: + value: null +hub_private_repo: + value: null +hub_strategy: + value: every_save +hub_token: + value: +id2label: + value: + "0": LABEL_0 + "1": LABEL_1 +ignore_data_skip: + value: false +include_for_metrics: + value: [] +include_inputs_for_metrics: + value: false +include_num_input_tokens_seen: + value: false +include_tokens_per_second: + value: false +initializer_range: + value: 0.02 +intermediate_size: + value: 8192 +is_decoder: + value: false +is_encoder_decoder: + value: false +jit_mode_eval: + value: false +joint_obs_action_mlp_lr: + value: 1e-06 +label_names: + value: null +label_smoothing_factor: + value: 0 +label2id: + value: + LABEL_0: 0 + LABEL_1: 1 +learning_rate: + value: 3e-05 +length_column_name: + value: length +length_penalty: + value: 1 +llm: + value: + adaptor_path: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.05/13.39.46_train_llm_lowdim_sweep-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-sweep-v2/checkpoint-3550 + causal_lm_target: llmbc.model.llm.llama_lowdim_model.LowdimLlamaForCausalLM + config_target: llmbc.model.llm.llama_lowdim_model.LowdimLlamaConfig + hydra: + job: + override_dirname: meta-llama/Llama-3.2-1B-Instruct + run: + dir: data/outputs/2025.03.10/09.20.26_meta-llama/Llama-3.2-1B-Instruct + load_from_checkpoint: false + load_from_mlp_projector_checkpoint: true + lora_config: + bias: none + lora_alpha: 32 + lora_dropout: 0.05 + r: 16 + task_type: CAUSAL_LM + max_length: 100 + mlp_projector_checkpoint_path: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.09/12.41.09_train_mlp_projector_metaworld/checkpoints/latest.ckpt + model_name: Llama-3.2-1B-Instruct + name: meta-llama/Llama-3.2-1B-Instruct + prompter: + _target_: llmbc.translator.prompter.llama_prompter.LlamaPrompter + use_joint_mlp_projector: true + use_joint_mlp_projector: true + use_orig_model: false + use_quantization: false +load_best_model_at_end: + value: false +local_rank: + value: 0 +log_level: + value: passive +log_level_replica: + value: warning +log_on_each_node: + value: true +logging: + value: + group: null + id: null + mode: online + name: 2025.03.10-09.20.26_train_llm_lowdim_push-v2 + project: llm_module_finetuning + resume: true + tags: + - train_llm_lowdim + - push-v2 + - train llm +logging_dir: + value: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/runs/Mar10_09-20-37_user +logging_first_step: + value: false +logging_nan_inf_filter: + value: true +logging_steps: + value: 1 +logging_strategy: + value: steps +lora_config: + value: + bias: none + lora_alpha: 32 + lora_dropout: 0.05 + r: 16 + task_type: CAUSAL_LM +lr_scheduler_type: + value: linear +max_grad_norm: + value: 1 +max_length: + value: 20 +max_position_embeddings: + value: 131072 +max_seq_length: + value: 100 +max_steps: + value: -1 +metric_for_best_model: + value: null +min_length: + value: 0 +mlp_bias: + value: false +model/num_parameters: + value: 1251311616 +model_init_kwargs: + value: null +model_name: + value: meta-llama/Llama-3.2-1B-Instruct +model_type: + value: llama_lowdim +mp_parameters: + value: "" +multi_run: + value: + run_dir: data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2 + wandb_name_base: 2025.03.10-09.20.26_train_llm_lowdim_push-v2 +n_action_steps: + value: 1 +n_obs_steps: + value: 1 +name: + value: train_llm_lowdim +neftune_noise_alpha: + value: null +no_cuda: + value: false +no_repeat_ngram_size: + value: 0 +num_attention_heads: + value: 32 +num_beam_groups: + value: 1 +num_beams: + value: 1 +num_hidden_layers: + value: 16 +num_key_value_heads: + value: 8 +num_of_sequences: + value: 1024 +num_return_sequences: + value: 1 +num_train_epochs: + value: 20 +obs_dim: + value: 9 +obs_mlp_lr: + value: null +optim: + value: paged_adamw_32bit +optim_args: + value: null +optim_target_modules: + value: null +output_attentions: + value: false +output_dir: + value: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2 +output_hidden_states: + value: false +output_scores: + value: false +overwrite_output_dir: + value: false +packing: + value: false +pad_token_id: + value: null +past_index: + value: -1 +peft_config: + value: + default: + auto_mapping: null + base_model_name_or_path: meta-llama/Llama-3.2-1B-Instruct + bias: none + eva_config: null + exclude_modules: null + fan_in_fan_out: false + inference_mode: false + init_lora_weights: true + layer_replication: null + layers_pattern: null + layers_to_transform: null + lora_alpha: 32 + lora_bias: false + lora_dropout: 0.05 + megatron_config: null + megatron_core: megatron.core + modules_to_save: null + peft_type: LORA + r: 16 + revision: null + runtime_config: + ephemeral_gpu_offload: false + target_modules: + - v_proj + - down_proj + - gate_proj + - q_proj + - o_proj + - k_proj + - up_proj + task_type: CAUSAL_LM + use_dora: false + use_rslora: false +per_device_eval_batch_size: + value: 1 +per_device_train_batch_size: + value: 1 +per_gpu_eval_batch_size: + value: null +per_gpu_train_batch_size: + value: null +prediction_loss_only: + value: false +prefix: + value: null +pretraining_tp: + value: 1 +problem_type: + value: null +push_to_hub: + value: false +push_to_hub_model_id: + value: null +push_to_hub_organization: + value: null +push_to_hub_token: + value: +ray_scope: + value: last +remove_invalid_values: + value: false +remove_unused_columns: + value: true +repetition_penalty: + value: 1 +report_to: + value: + - wandb +restore_callback_states_from_checkpoint: + value: false +resume_from_checkpoint: + value: null +return_dict: + value: true +return_dict_in_generate: + value: false +rms_norm_eps: + value: 1e-05 +rope_scaling: + value: + factor: 32 + high_freq_factor: 4 + low_freq_factor: 1 + original_max_position_embeddings: 8192 + rope_type: llama3 +rope_theta: + value: 500000 +run_name: + value: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2 +save_on_each_node: + value: false +save_only_model: + value: false +save_safetensors: + value: true +save_steps: + value: 200 +save_strategy: + value: steps +save_total_limit: + value: null +seed: + value: 42 +sep_token_id: + value: null +skip_memory_metrics: + value: true +split_batches: + value: null +suppress_tokens: + value: null +task: + value: + action_dim: 4 + dataset: + _target_: llmbc.dataset.metaworld_lowdim_dataset.MetaworldLowdimDataset + data_path: datasets/push-v2-general.pt + data_path2: datasets/push-v2.pt + dummy_normalizer: true + horizon: 1 + obs_eef_target: true + pad_after: 0 + pad_before: 0 + use_manual_normalizer: false + val_ratio: 0.2 + env_runner: + _target_: llmbc.env_runner.metaworld_lowdim_runner.MetaworldLowdimRunner + env_name: llf-metaworld-push-v2 + feedback_type: + - hp + - hn + - fp + instruction_type: b + max_steps: 30 + n_action_steps: 1 + n_obs_steps: 1 + visual: false + instructor: + _target_: llmbc.translator.instructor.metaworld_instructor.push_v2_instructor.PushV2Instructor + name: push-v2 + obs_dim: 9 +task_name: + value: push-v2 +task_specific_params: + value: null +temperature: + value: 1 +tf_legacy_loss: + value: false +tf32: + value: true +tie_encoder_decoder: + value: false +tie_word_embeddings: + value: true +tokenizer_class: + value: null +top_k: + value: 50 +top_p: + value: 1 +torch_compile: + value: false +torch_compile_backend: + value: null +torch_compile_mode: + value: null +torch_dtype: + value: bfloat16 +torch_empty_cache_steps: + value: null +torchdynamo: + value: null +torchscript: + value: false +tpu_metrics_debug: + value: false +tpu_num_cores: + value: null +trainer: + value: + action_dim: 4 + dataset_text_field: text + max_seq_length: 100 + obs_dim: 9 + packing: false + use_joint_mlp_projector: true +training: + value: + bf16: true + eval_steps: 10 + eval_strategy: steps + fp16: false + gradient_accumulation_steps: 64 + group_by_length: true + joint_obs_action_mlp_lr: 1e-06 + learning_rate: 3e-05 + logging_steps: 1 + logging_strategy: steps + num_train_epochs: 20 + optim: paged_adamw_32bit + per_device_eval_batch_size: 1 + per_device_train_batch_size: 1 + report_to: wandb + save_steps: 200 + seed: 42 + tf32: true + use_joint_mlp_projector: true + warmup_steps: 10 +transformers_version: + value: 4.47.1 +typical_p: + value: 1 +use_bfloat16: + value: false +use_cache: + value: false +use_cpu: + value: false +use_ipex: + value: false +use_joint_mlp_projector: + value: true +use_legacy_prediction_loop: + value: false +use_liger: + value: false +use_liger_kernel: + value: false +use_mps_device: + value: false +use_quantization: + value: false +vocab_size: + value: 128256 +warmup_ratio: + value: 0 +warmup_steps: + value: 10 +weight_decay: + value: 0 diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/output.log b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..4ae7feeb8cb5f00d29d9add0b7b73fe2506814d2 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/output.log @@ -0,0 +1,7241 @@ +You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. +Some weights of LowdimLlamaForCausalLM were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct and are newly initialized: ['model.joint_obs_action_projector.projector.0.bias', 'model.joint_obs_action_projector.projector.0.weight', 'model.joint_obs_action_projector.projector.2.bias', 'model.joint_obs_action_projector.projector.2.weight'] +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. +Loading from mlp projector checkpoint: /home/chyang/workspace/LLM-BC/data/outputs/2025.03.09/12.41.09_train_mlp_projector_metaworld/checkpoints/latest.ckpt +Loading the original prtrained LLM with Lora Adaptor. +Multistep Flattening Dataset: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17899/17899 [00:00<00:00, 24766.78it/s] +Setting TOKENIZERS_PARALLELISM=false for forked processes. +[2025-03-10 09:20:33,509][datasets.arrow_dataset][WARNING] - Setting TOKENIZERS_PARALLELISM=false for forked processes. +Map (num_proc=4): 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 17899/17899 [00:03<00:00, 4903.96 examples/s] +DatasetDict({ + train: Dataset({ + features: ['obs', 'action', 'description', 'input', 'output', 'text', 'input_ids', 'labels'], + num_rows: 17720 + }) + test: Dataset({ + features: ['obs', 'action', 'description', 'input', 'output', 'text', 'input_ids', 'labels'], + num_rows: 179 + }) +}) +/home/chyang/miniconda3/envs/llm-bc/lib/python3.9/site-packages/huggingface_hub/utils/_deprecation.py:100: FutureWarning: Deprecated argument(s) used in '__init__': max_seq_length, dataset_text_field. Will not be supported from version '1.0.0'. + +Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead. + warnings.warn(message, FutureWarning) +/home/chyang/miniconda3/envs/llm-bc/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:283: UserWarning: You passed a `max_seq_length` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`. + warnings.warn( +/home/chyang/miniconda3/envs/llm-bc/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:321: UserWarning: You passed a `dataset_text_field` argument to the SFTTrainer, the value you passed will override the one in the `SFTConfig`. + warnings.warn( +/home/chyang/miniconda3/envs/llm-bc/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:401: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `LowdimSFTTrainer.__init__`. Use `processing_class` instead. + super().__init__( +[2025-03-10 09:20:37,440] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2025-03-10 09:20:37,556][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -I/home/chyang/miniconda3/envs/llm-bc/include -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -fPIC -c /tmp/tmp279_x4mk/test.c -o /tmp/tmp279_x4mk/test.o +[2025-03-10 09:20:37,599][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat /tmp/tmp279_x4mk/test.o -laio -o /tmp/tmp279_x4mk/a.out +[2025-03-10 09:20:37,845][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat -Wno-unused-result -Wsign-compare -DNDEBUG -O2 -Wall -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -I/home/chyang/miniconda3/envs/llm-bc/include -fPIC -O2 -isystem /home/chyang/miniconda3/envs/llm-bc/include -fPIC -c /tmp/tmpsqvpk3lb/test.c -o /tmp/tmpsqvpk3lb/test.o +[2025-03-10 09:20:37,894][root][INFO] - gcc -pthread -B /home/chyang/miniconda3/envs/llm-bc/compiler_compat /tmp/tmpsqvpk3lb/test.o -L/usr/local/cuda -L/usr/local/cuda/lib64 -lcufile -o /tmp/tmpsqvpk3lb/a.out +wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter. + 0%|β–Ž | 10/5520 [00:27<4:13:24, 2.76s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'loss': 72.9219, 'grad_norm': 826.8458251953125, 'learning_rate': 3e-06, 'epoch': 0.0} +{'loss': 72.5411, 'grad_norm': 716.6332397460938, 'learning_rate': 6e-06, 'epoch': 0.01} +{'loss': 68.2333, 'grad_norm': 653.662109375, 'learning_rate': 9e-06, 'epoch': 0.01} +{'loss': 67.0506, 'grad_norm': 678.8214111328125, 'learning_rate': 1.2e-05, 'epoch': 0.01} +{'loss': 67.0048, 'grad_norm': 584.922607421875, 'learning_rate': 1.5e-05, 'epoch': 0.02} +{'loss': 68.4059, 'grad_norm': 678.7247924804688, 'learning_rate': 1.8e-05, 'epoch': 0.02} +{'loss': 71.0148, 'grad_norm': 911.47509765625, 'learning_rate': 2.1e-05, 'epoch': 0.03} +{'loss': 71.4146, 'grad_norm': 924.4578247070312, 'learning_rate': 2.4e-05, 'epoch': 0.03} +{'loss': 70.8432, 'grad_norm': 1064.275634765625, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.03} +{'loss': 73.1796, 'grad_norm': 850.4259033203125, 'learning_rate': 3e-05, 'epoch': 0.04} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 0%|β–Œ | 20/5520 [00:58<4:15:50, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 1.0189366340637207, 'eval_runtime': 3.1411, 'eval_samples_per_second': 56.986, 'eval_steps_per_second': 56.986, 'epoch': 0.04} +{'loss': 70.5576, 'grad_norm': 870.9306030273438, 'learning_rate': 2.9994555353901996e-05, 'epoch': 0.04} +{'loss': 65.2322, 'grad_norm': 794.9625244140625, 'learning_rate': 2.998911070780399e-05, 'epoch': 0.04} +{'loss': 62.7158, 'grad_norm': 989.5623779296875, 'learning_rate': 2.998366606170599e-05, 'epoch': 0.05} +{'loss': 65.801, 'grad_norm': 941.0211181640625, 'learning_rate': 2.9978221415607986e-05, 'epoch': 0.05} +{'loss': 63.4828, 'grad_norm': 863.9938354492188, 'learning_rate': 2.9972776769509984e-05, 'epoch': 0.05} +{'loss': 61.3068, 'grad_norm': 711.3890991210938, 'learning_rate': 2.996733212341198e-05, 'epoch': 0.06} +{'loss': 64.1753, 'grad_norm': 885.39501953125, 'learning_rate': 2.9961887477313975e-05, 'epoch': 0.06} +{'loss': 63.6775, 'grad_norm': 655.5796508789062, 'learning_rate': 2.995644283121597e-05, 'epoch': 0.07} +{'loss': 62.8369, 'grad_norm': 681.5781860351562, 'learning_rate': 2.995099818511797e-05, 'epoch': 0.07} +{'loss': 61.3176, 'grad_norm': 605.4241943359375, 'learning_rate': 2.9945553539019965e-05, 'epoch': 0.07} + 1%|β–‰ | 30/5520 [01:29<4:15:14, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.9650000929832458, 'eval_runtime': 3.1443, 'eval_samples_per_second': 56.928, 'eval_steps_per_second': 56.928, 'epoch': 0.07} +{'loss': 61.9691, 'grad_norm': 588.86376953125, 'learning_rate': 2.994010889292196e-05, 'epoch': 0.08} +{'loss': 61.2061, 'grad_norm': 729.6580810546875, 'learning_rate': 2.9934664246823956e-05, 'epoch': 0.08} +{'loss': 63.1849, 'grad_norm': 621.9948120117188, 'learning_rate': 2.992921960072595e-05, 'epoch': 0.08} +{'loss': 64.0214, 'grad_norm': 816.0555419921875, 'learning_rate': 2.9923774954627953e-05, 'epoch': 0.09} +{'loss': 62.0578, 'grad_norm': 728.7860107421875, 'learning_rate': 2.991833030852995e-05, 'epoch': 0.09} +{'loss': 62.6916, 'grad_norm': 897.5223999023438, 'learning_rate': 2.9912885662431944e-05, 'epoch': 0.09} +{'loss': 61.2081, 'grad_norm': 624.7844848632812, 'learning_rate': 2.990744101633394e-05, 'epoch': 0.1} +{'loss': 60.2182, 'grad_norm': 661.22119140625, 'learning_rate': 2.9901996370235935e-05, 'epoch': 0.1} +{'loss': 57.5996, 'grad_norm': 574.8737182617188, 'learning_rate': 2.989655172413793e-05, 'epoch': 0.1} +{'loss': 55.8385, 'grad_norm': 766.5988159179688, 'learning_rate': 2.989110707803993e-05, 'epoch': 0.11} + 1%|β–ˆβ– | 40/5520 [02:00<4:15:20, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.9189058542251587, 'eval_runtime': 3.1396, 'eval_samples_per_second': 57.014, 'eval_steps_per_second': 57.014, 'epoch': 0.11} +{'loss': 53.8883, 'grad_norm': 851.9244995117188, 'learning_rate': 2.9885662431941924e-05, 'epoch': 0.11} +{'loss': 52.2324, 'grad_norm': 689.07470703125, 'learning_rate': 2.988021778584392e-05, 'epoch': 0.12} +{'loss': 54.1271, 'grad_norm': 716.2824096679688, 'learning_rate': 2.9874773139745915e-05, 'epoch': 0.12} +{'loss': 50.9066, 'grad_norm': 718.0765991210938, 'learning_rate': 2.9869328493647914e-05, 'epoch': 0.12} +{'loss': 51.6759, 'grad_norm': 774.018798828125, 'learning_rate': 2.9863883847549912e-05, 'epoch': 0.13} +{'loss': 52.6699, 'grad_norm': 725.5440063476562, 'learning_rate': 2.9858439201451908e-05, 'epoch': 0.13} +{'loss': 51.6784, 'grad_norm': 669.84765625, 'learning_rate': 2.9852994555353903e-05, 'epoch': 0.13} +{'loss': 53.7148, 'grad_norm': 569.4988403320312, 'learning_rate': 2.98475499092559e-05, 'epoch': 0.14} +{'loss': 54.6741, 'grad_norm': 723.3594360351562, 'learning_rate': 2.9842105263157894e-05, 'epoch': 0.14} +{'loss': 52.1797, 'grad_norm': 709.8211059570312, 'learning_rate': 2.983666061705989e-05, 'epoch': 0.14} + 1%|β–ˆβ–Œ | 50/5520 [02:30<4:14:21, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.8676205277442932, 'eval_runtime': 3.1293, 'eval_samples_per_second': 57.202, 'eval_steps_per_second': 57.202, 'epoch': 0.14} +{'loss': 50.9864, 'grad_norm': 641.121337890625, 'learning_rate': 2.9831215970961888e-05, 'epoch': 0.15} +{'loss': 50.4881, 'grad_norm': 653.1666259765625, 'learning_rate': 2.9825771324863884e-05, 'epoch': 0.15} +{'loss': 51.6959, 'grad_norm': 701.0926513671875, 'learning_rate': 2.9820326678765882e-05, 'epoch': 0.16} +{'loss': 54.8321, 'grad_norm': 838.512451171875, 'learning_rate': 2.9814882032667878e-05, 'epoch': 0.16} +{'loss': 50.6469, 'grad_norm': 905.4413452148438, 'learning_rate': 2.9809437386569873e-05, 'epoch': 0.16} +{'loss': 47.2041, 'grad_norm': 762.02783203125, 'learning_rate': 2.9803992740471872e-05, 'epoch': 0.17} +{'loss': 36.6458, 'grad_norm': 718.588623046875, 'learning_rate': 2.9798548094373867e-05, 'epoch': 0.17} +{'loss': 35.4111, 'grad_norm': 974.5386962890625, 'learning_rate': 2.9793103448275863e-05, 'epoch': 0.17} +{'loss': 35.6902, 'grad_norm': 809.5028076171875, 'learning_rate': 2.9787658802177858e-05, 'epoch': 0.18} +{'loss': 34.4502, 'grad_norm': 814.4694213867188, 'learning_rate': 2.9782214156079853e-05, 'epoch': 0.18} + 1%|β–ˆβ–Š | 60/5520 [03:01<4:13:05, 2.78s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.8749106526374817, 'eval_runtime': 3.133, 'eval_samples_per_second': 57.133, 'eval_steps_per_second': 57.133, 'epoch': 0.18} +{'loss': 67.7942, 'grad_norm': 2068.640380859375, 'learning_rate': 2.9776769509981852e-05, 'epoch': 0.18} +{'loss': 64.3457, 'grad_norm': 1760.789306640625, 'learning_rate': 2.977132486388385e-05, 'epoch': 0.19} +{'loss': 62.0075, 'grad_norm': 1317.9237060546875, 'learning_rate': 2.9765880217785846e-05, 'epoch': 0.19} +{'loss': 60.4988, 'grad_norm': 949.7896118164062, 'learning_rate': 2.976043557168784e-05, 'epoch': 0.2} +{'loss': 56.8426, 'grad_norm': 862.1629028320312, 'learning_rate': 2.9754990925589837e-05, 'epoch': 0.2} +{'loss': 56.7855, 'grad_norm': 978.5818481445312, 'learning_rate': 2.9749546279491832e-05, 'epoch': 0.2} +{'loss': 58.6869, 'grad_norm': 1055.5872802734375, 'learning_rate': 2.974410163339383e-05, 'epoch': 0.21} +{'loss': 57.318, 'grad_norm': 971.089599609375, 'learning_rate': 2.9738656987295827e-05, 'epoch': 0.21} +{'loss': 56.7783, 'grad_norm': 823.1680908203125, 'learning_rate': 2.9733212341197822e-05, 'epoch': 0.21} +{'loss': 57.0712, 'grad_norm': 660.5692138671875, 'learning_rate': 2.9727767695099817e-05, 'epoch': 0.22} + 1%|β–ˆβ–ˆβ– | 70/5520 [03:31<4:12:09, 2.78s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.8012754917144775, 'eval_runtime': 3.1336, 'eval_samples_per_second': 57.123, 'eval_steps_per_second': 57.123, 'epoch': 0.22} +{'loss': 55.192, 'grad_norm': 669.174072265625, 'learning_rate': 2.9722323049001816e-05, 'epoch': 0.22} +{'loss': 50.8828, 'grad_norm': 688.8255004882812, 'learning_rate': 2.971687840290381e-05, 'epoch': 0.22} +{'loss': 50.3083, 'grad_norm': 699.8623657226562, 'learning_rate': 2.971143375680581e-05, 'epoch': 0.23} +{'loss': 49.7228, 'grad_norm': 559.8364868164062, 'learning_rate': 2.9705989110707806e-05, 'epoch': 0.23} +{'loss': 49.8954, 'grad_norm': 709.3836059570312, 'learning_rate': 2.97005444646098e-05, 'epoch': 0.23} +{'loss': 49.1461, 'grad_norm': 678.072265625, 'learning_rate': 2.9695099818511796e-05, 'epoch': 0.24} +{'loss': 49.6423, 'grad_norm': 672.2944946289062, 'learning_rate': 2.9689655172413792e-05, 'epoch': 0.24} +{'loss': 49.3827, 'grad_norm': 494.2787780761719, 'learning_rate': 2.968421052631579e-05, 'epoch': 0.25} +{'loss': 51.385, 'grad_norm': 440.1124267578125, 'learning_rate': 2.9678765880217786e-05, 'epoch': 0.25} +{'loss': 50.4029, 'grad_norm': 592.347412109375, 'learning_rate': 2.9673321234119785e-05, 'epoch': 0.25} + 1%|β–ˆβ–ˆβ– | 80/5520 [04:02<4:12:40, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7771623730659485, 'eval_runtime': 3.1347, 'eval_samples_per_second': 57.104, 'eval_steps_per_second': 57.104, 'epoch': 0.25} +{'loss': 51.1812, 'grad_norm': 637.4396362304688, 'learning_rate': 2.966787658802178e-05, 'epoch': 0.26} +{'loss': 51.0345, 'grad_norm': 485.1819763183594, 'learning_rate': 2.9662431941923776e-05, 'epoch': 0.26} +{'loss': 52.2199, 'grad_norm': 598.6526489257812, 'learning_rate': 2.9656987295825774e-05, 'epoch': 0.26} +{'loss': 51.7395, 'grad_norm': 554.0598754882812, 'learning_rate': 2.965154264972777e-05, 'epoch': 0.27} +{'loss': 51.124, 'grad_norm': 587.4779052734375, 'learning_rate': 2.9646098003629765e-05, 'epoch': 0.27} +{'loss': 50.7046, 'grad_norm': 483.8338317871094, 'learning_rate': 2.964065335753176e-05, 'epoch': 0.27} +{'loss': 49.4543, 'grad_norm': 556.6511840820312, 'learning_rate': 2.9635208711433756e-05, 'epoch': 0.28} +{'loss': 49.1305, 'grad_norm': 535.6243286132812, 'learning_rate': 2.962976406533575e-05, 'epoch': 0.28} +{'loss': 47.6811, 'grad_norm': 550.9852905273438, 'learning_rate': 2.962431941923775e-05, 'epoch': 0.29} +{'loss': 44.97, 'grad_norm': 686.528076171875, 'learning_rate': 2.961887477313975e-05, 'epoch': 0.29} + 2%|β–ˆβ–ˆβ–Š | 90/5520 [04:33<4:11:21, 2.78s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7604023814201355, 'eval_runtime': 3.1365, 'eval_samples_per_second': 57.07, 'eval_steps_per_second': 57.07, 'epoch': 0.29} +{'loss': 42.5007, 'grad_norm': 828.4725952148438, 'learning_rate': 2.9613430127041744e-05, 'epoch': 0.29} +{'loss': 41.9718, 'grad_norm': 644.0596313476562, 'learning_rate': 2.960798548094374e-05, 'epoch': 0.3} +{'loss': 44.1048, 'grad_norm': 578.7656860351562, 'learning_rate': 2.9602540834845735e-05, 'epoch': 0.3} +{'loss': 43.6878, 'grad_norm': 589.760498046875, 'learning_rate': 2.9597096188747734e-05, 'epoch': 0.3} +{'loss': 44.3374, 'grad_norm': 715.7012939453125, 'learning_rate': 2.959165154264973e-05, 'epoch': 0.31} +{'loss': 44.4499, 'grad_norm': 649.3252563476562, 'learning_rate': 2.9586206896551724e-05, 'epoch': 0.31} +{'loss': 44.4535, 'grad_norm': 640.3419189453125, 'learning_rate': 2.958076225045372e-05, 'epoch': 0.31} +{'loss': 45.0348, 'grad_norm': 591.23388671875, 'learning_rate': 2.9575317604355715e-05, 'epoch': 0.32} +{'loss': 44.1963, 'grad_norm': 544.8179321289062, 'learning_rate': 2.9569872958257714e-05, 'epoch': 0.32} +{'loss': 44.9479, 'grad_norm': 594.55224609375, 'learning_rate': 2.9564428312159713e-05, 'epoch': 0.33} + 2%|β–ˆβ–ˆβ–ˆ | 100/5520 [05:03<4:12:40, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7576387524604797, 'eval_runtime': 3.1374, 'eval_samples_per_second': 57.053, 'eval_steps_per_second': 57.053, 'epoch': 0.33} +{'loss': 45.3891, 'grad_norm': 536.4320068359375, 'learning_rate': 2.9558983666061708e-05, 'epoch': 0.33} +{'loss': 44.9822, 'grad_norm': 536.9632568359375, 'learning_rate': 2.9553539019963703e-05, 'epoch': 0.33} +{'loss': 45.066, 'grad_norm': 505.9728698730469, 'learning_rate': 2.95480943738657e-05, 'epoch': 0.34} +{'loss': 43.6293, 'grad_norm': 453.0039367675781, 'learning_rate': 2.9542649727767694e-05, 'epoch': 0.34} +{'loss': 44.3293, 'grad_norm': 466.6203308105469, 'learning_rate': 2.9537205081669693e-05, 'epoch': 0.34} +{'loss': 40.2154, 'grad_norm': 532.4081420898438, 'learning_rate': 2.953176043557169e-05, 'epoch': 0.35} +{'loss': 31.5673, 'grad_norm': 577.1102294921875, 'learning_rate': 2.9526315789473684e-05, 'epoch': 0.35} +{'loss': 29.3586, 'grad_norm': 441.4743347167969, 'learning_rate': 2.9520871143375683e-05, 'epoch': 0.35} +{'loss': 29.042, 'grad_norm': 432.3975830078125, 'learning_rate': 2.9515426497277678e-05, 'epoch': 0.36} +{'loss': 31.7708, 'grad_norm': 642.6970825195312, 'learning_rate': 2.9509981851179673e-05, 'epoch': 0.36} + 2%|β–ˆβ–ˆβ–ˆβ–Ž | 110/5520 [05:34<4:13:14, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.8288812637329102, 'eval_runtime': 3.1289, 'eval_samples_per_second': 57.208, 'eval_steps_per_second': 57.208, 'epoch': 0.36} +{'loss': 64.0132, 'grad_norm': 1607.9227294921875, 'learning_rate': 2.9504537205081672e-05, 'epoch': 0.36} +{'loss': 62.4924, 'grad_norm': 1462.2889404296875, 'learning_rate': 2.9499092558983667e-05, 'epoch': 0.37} +{'loss': 58.4323, 'grad_norm': 1075.0196533203125, 'learning_rate': 2.9493647912885663e-05, 'epoch': 0.37} +{'loss': 55.141, 'grad_norm': 884.6957397460938, 'learning_rate': 2.9488203266787658e-05, 'epoch': 0.38} +{'loss': 54.163, 'grad_norm': 783.4414672851562, 'learning_rate': 2.9482758620689654e-05, 'epoch': 0.38} +{'loss': 55.1398, 'grad_norm': 982.4120483398438, 'learning_rate': 2.9477313974591652e-05, 'epoch': 0.38} +{'loss': 53.8404, 'grad_norm': 853.049560546875, 'learning_rate': 2.947186932849365e-05, 'epoch': 0.39} +{'loss': 53.1712, 'grad_norm': 722.6901245117188, 'learning_rate': 2.9466424682395647e-05, 'epoch': 0.39} +{'loss': 53.1349, 'grad_norm': 691.1047973632812, 'learning_rate': 2.9460980036297642e-05, 'epoch': 0.39} +{'loss': 53.1488, 'grad_norm': 659.1260986328125, 'learning_rate': 2.9455535390199637e-05, 'epoch': 0.4} + 2%|β–ˆβ–ˆβ–ˆβ–‹ | 120/5520 [06:05<4:11:17, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7457038164138794, 'eval_runtime': 3.134, 'eval_samples_per_second': 57.115, 'eval_steps_per_second': 57.115, 'epoch': 0.4} +{'loss': 49.9333, 'grad_norm': 575.7744750976562, 'learning_rate': 2.9450090744101633e-05, 'epoch': 0.4} +{'loss': 47.9028, 'grad_norm': 482.8963317871094, 'learning_rate': 2.944464609800363e-05, 'epoch': 0.4} +{'loss': 46.8302, 'grad_norm': 563.2509765625, 'learning_rate': 2.9439201451905627e-05, 'epoch': 0.41} +{'loss': 46.6043, 'grad_norm': 597.126953125, 'learning_rate': 2.9433756805807622e-05, 'epoch': 0.41} +{'loss': 47.9899, 'grad_norm': 674.9114990234375, 'learning_rate': 2.9428312159709618e-05, 'epoch': 0.42} +{'loss': 46.5175, 'grad_norm': 564.3960571289062, 'learning_rate': 2.9422867513611616e-05, 'epoch': 0.42} +{'loss': 46.1521, 'grad_norm': 482.7253723144531, 'learning_rate': 2.9417422867513615e-05, 'epoch': 0.42} +{'loss': 46.1505, 'grad_norm': 412.52935791015625, 'learning_rate': 2.941197822141561e-05, 'epoch': 0.43} +{'loss': 47.1023, 'grad_norm': 483.7874450683594, 'learning_rate': 2.9406533575317606e-05, 'epoch': 0.43} +{'loss': 46.5822, 'grad_norm': 469.2854309082031, 'learning_rate': 2.94010889292196e-05, 'epoch': 0.43} + 2%|β–ˆβ–ˆβ–ˆβ–‰ | 130/5520 [06:36<4:09:56, 2.78s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.719998300075531, 'eval_runtime': 3.1321, 'eval_samples_per_second': 57.15, 'eval_steps_per_second': 57.15, 'epoch': 0.43} +{'loss': 46.4077, 'grad_norm': 413.3222351074219, 'learning_rate': 2.9395644283121597e-05, 'epoch': 0.44} +{'loss': 46.7971, 'grad_norm': 473.6437683105469, 'learning_rate': 2.9390199637023592e-05, 'epoch': 0.44} +{'loss': 48.0664, 'grad_norm': 477.3919677734375, 'learning_rate': 2.938475499092559e-05, 'epoch': 0.44} +{'loss': 47.9131, 'grad_norm': 505.3496398925781, 'learning_rate': 2.9379310344827586e-05, 'epoch': 0.45} +{'loss': 47.1492, 'grad_norm': 502.92755126953125, 'learning_rate': 2.937386569872958e-05, 'epoch': 0.45} +{'loss': 49.1792, 'grad_norm': 483.64117431640625, 'learning_rate': 2.936842105263158e-05, 'epoch': 0.46} +{'loss': 49.4426, 'grad_norm': 459.92559814453125, 'learning_rate': 2.9362976406533576e-05, 'epoch': 0.46} +{'loss': 46.2051, 'grad_norm': 401.9190673828125, 'learning_rate': 2.9357531760435575e-05, 'epoch': 0.46} +{'loss': 43.9258, 'grad_norm': 601.756103515625, 'learning_rate': 2.935208711433757e-05, 'epoch': 0.47} +{'loss': 43.7106, 'grad_norm': 954.7610473632812, 'learning_rate': 2.9346642468239565e-05, 'epoch': 0.47} + 3%|β–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 140/5520 [07:06<4:10:12, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7346343398094177, 'eval_runtime': 3.1372, 'eval_samples_per_second': 57.057, 'eval_steps_per_second': 57.057, 'epoch': 0.47} +{'loss': 40.4499, 'grad_norm': 635.9511108398438, 'learning_rate': 2.934119782214156e-05, 'epoch': 0.47} +{'loss': 40.9184, 'grad_norm': 603.8322143554688, 'learning_rate': 2.9335753176043556e-05, 'epoch': 0.48} +{'loss': 41.3631, 'grad_norm': 435.4403381347656, 'learning_rate': 2.933030852994555e-05, 'epoch': 0.48} +{'loss': 41.1298, 'grad_norm': 445.1494140625, 'learning_rate': 2.932486388384755e-05, 'epoch': 0.48} +{'loss': 42.6427, 'grad_norm': 627.1956176757812, 'learning_rate': 2.931941923774955e-05, 'epoch': 0.49} +{'loss': 40.8941, 'grad_norm': 364.08441162109375, 'learning_rate': 2.9313974591651544e-05, 'epoch': 0.49} +{'loss': 43.2699, 'grad_norm': 521.076904296875, 'learning_rate': 2.930852994555354e-05, 'epoch': 0.49} +{'loss': 42.9513, 'grad_norm': 480.8160095214844, 'learning_rate': 2.9303085299455535e-05, 'epoch': 0.5} +{'loss': 42.4648, 'grad_norm': 484.83172607421875, 'learning_rate': 2.9297640653357534e-05, 'epoch': 0.5} +{'loss': 43.4351, 'grad_norm': 478.5503845214844, 'learning_rate': 2.929219600725953e-05, 'epoch': 0.51} + 3%|β–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 150/5520 [07:37<4:10:02, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7245867252349854, 'eval_runtime': 3.1305, 'eval_samples_per_second': 57.178, 'eval_steps_per_second': 57.178, 'epoch': 0.51} +{'loss': 42.7249, 'grad_norm': 501.84991455078125, 'learning_rate': 2.9286751361161525e-05, 'epoch': 0.51} +{'loss': 42.7323, 'grad_norm': 496.357177734375, 'learning_rate': 2.928130671506352e-05, 'epoch': 0.51} +{'loss': 44.2251, 'grad_norm': 476.9631042480469, 'learning_rate': 2.9275862068965515e-05, 'epoch': 0.52} +{'loss': 43.2753, 'grad_norm': 435.324951171875, 'learning_rate': 2.9270417422867514e-05, 'epoch': 0.52} +{'loss': 43.9547, 'grad_norm': 486.4795227050781, 'learning_rate': 2.9264972776769513e-05, 'epoch': 0.52} +{'loss': 32.5569, 'grad_norm': 573.4031372070312, 'learning_rate': 2.925952813067151e-05, 'epoch': 0.53} +{'loss': 29.0521, 'grad_norm': 429.2251892089844, 'learning_rate': 2.9254083484573504e-05, 'epoch': 0.53} +{'loss': 28.9163, 'grad_norm': 478.80426025390625, 'learning_rate': 2.92486388384755e-05, 'epoch': 0.53} +{'loss': 29.2594, 'grad_norm': 475.7033996582031, 'learning_rate': 2.9243194192377495e-05, 'epoch': 0.54} +{'loss': 29.8788, 'grad_norm': 530.3062133789062, 'learning_rate': 2.9237749546279493e-05, 'epoch': 0.54} + 3%|β–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 160/5520 [08:08<4:09:21, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.8220540285110474, 'eval_runtime': 3.1342, 'eval_samples_per_second': 57.112, 'eval_steps_per_second': 57.112, 'epoch': 0.54} +{'loss': 62.5697, 'grad_norm': 1374.2142333984375, 'learning_rate': 2.923230490018149e-05, 'epoch': 0.55} +{'loss': 61.1637, 'grad_norm': 1227.5701904296875, 'learning_rate': 2.9226860254083484e-05, 'epoch': 0.55} +{'loss': 57.9838, 'grad_norm': 980.4124145507812, 'learning_rate': 2.9221415607985483e-05, 'epoch': 0.55} +{'loss': 56.3787, 'grad_norm': 792.8090209960938, 'learning_rate': 2.9215970961887478e-05, 'epoch': 0.56} +{'loss': 52.8103, 'grad_norm': 602.3422241210938, 'learning_rate': 2.9210526315789474e-05, 'epoch': 0.56} +{'loss': 51.002, 'grad_norm': 493.4346008300781, 'learning_rate': 2.9205081669691472e-05, 'epoch': 0.56} +{'loss': 50.0153, 'grad_norm': 619.7504272460938, 'learning_rate': 2.9199637023593468e-05, 'epoch': 0.57} +{'loss': 52.3504, 'grad_norm': 610.8827514648438, 'learning_rate': 2.9194192377495463e-05, 'epoch': 0.57} +{'loss': 52.9307, 'grad_norm': 670.8658447265625, 'learning_rate': 2.918874773139746e-05, 'epoch': 0.57} +{'loss': 49.7189, 'grad_norm': 552.539306640625, 'learning_rate': 2.9183303085299454e-05, 'epoch': 0.58} + 3%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 170/5520 [08:38<4:10:01, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7438566088676453, 'eval_runtime': 3.134, 'eval_samples_per_second': 57.116, 'eval_steps_per_second': 57.116, 'epoch': 0.58} +{'loss': 50.6365, 'grad_norm': 605.2847900390625, 'learning_rate': 2.9177858439201453e-05, 'epoch': 0.58} +{'loss': 45.5784, 'grad_norm': 460.163818359375, 'learning_rate': 2.9172413793103448e-05, 'epoch': 0.59} +{'loss': 45.6859, 'grad_norm': 630.098876953125, 'learning_rate': 2.9166969147005447e-05, 'epoch': 0.59} +{'loss': 45.3804, 'grad_norm': 532.3728637695312, 'learning_rate': 2.9161524500907442e-05, 'epoch': 0.59} +{'loss': 44.6911, 'grad_norm': 510.09234619140625, 'learning_rate': 2.9156079854809438e-05, 'epoch': 0.6} +{'loss': 45.7436, 'grad_norm': 465.53887939453125, 'learning_rate': 2.9150635208711436e-05, 'epoch': 0.6} +{'loss': 45.3019, 'grad_norm': 413.5904235839844, 'learning_rate': 2.9145190562613432e-05, 'epoch': 0.6} +{'loss': 46.0631, 'grad_norm': 514.5824584960938, 'learning_rate': 2.9139745916515427e-05, 'epoch': 0.61} +{'loss': 46.032, 'grad_norm': 402.7557373046875, 'learning_rate': 2.9134301270417423e-05, 'epoch': 0.61} +{'loss': 46.1674, 'grad_norm': 434.61138916015625, 'learning_rate': 2.9128856624319418e-05, 'epoch': 0.61} + 3%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 180/5520 [09:09<4:07:49, 2.78s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7043496966362, 'eval_runtime': 3.1315, 'eval_samples_per_second': 57.162, 'eval_steps_per_second': 57.162, 'epoch': 0.61} +{'loss': 47.4448, 'grad_norm': 368.5428771972656, 'learning_rate': 2.9123411978221413e-05, 'epoch': 0.62} +{'loss': 46.0437, 'grad_norm': 382.7486267089844, 'learning_rate': 2.9117967332123415e-05, 'epoch': 0.62} +{'loss': 47.2806, 'grad_norm': 373.2402038574219, 'learning_rate': 2.911252268602541e-05, 'epoch': 0.62} +{'loss': 46.9239, 'grad_norm': 404.00799560546875, 'learning_rate': 2.9107078039927406e-05, 'epoch': 0.63} +{'loss': 47.2773, 'grad_norm': 421.1421203613281, 'learning_rate': 2.91016333938294e-05, 'epoch': 0.63} +{'loss': 47.7277, 'grad_norm': 384.21380615234375, 'learning_rate': 2.9096188747731397e-05, 'epoch': 0.64} +{'loss': 47.4115, 'grad_norm': 401.65625, 'learning_rate': 2.9090744101633396e-05, 'epoch': 0.64} +{'loss': 46.9206, 'grad_norm': 389.7224426269531, 'learning_rate': 2.908529945553539e-05, 'epoch': 0.64} +{'loss': 43.074, 'grad_norm': 370.7626037597656, 'learning_rate': 2.9079854809437387e-05, 'epoch': 0.65} +{'loss': 40.7953, 'grad_norm': 397.579833984375, 'learning_rate': 2.9074410163339382e-05, 'epoch': 0.65} + 3%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 190/5520 [09:40<4:07:45, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7069951295852661, 'eval_runtime': 3.1343, 'eval_samples_per_second': 57.109, 'eval_steps_per_second': 57.109, 'epoch': 0.65} +{'loss': 39.9822, 'grad_norm': 355.5390625, 'learning_rate': 2.906896551724138e-05, 'epoch': 0.65} +{'loss': 38.2107, 'grad_norm': 385.6095275878906, 'learning_rate': 2.9063520871143376e-05, 'epoch': 0.66} +{'loss': 40.9879, 'grad_norm': 469.42449951171875, 'learning_rate': 2.9058076225045375e-05, 'epoch': 0.66} +{'loss': 39.9646, 'grad_norm': 374.644287109375, 'learning_rate': 2.905263157894737e-05, 'epoch': 0.66} +{'loss': 40.37, 'grad_norm': 423.72412109375, 'learning_rate': 2.9047186932849366e-05, 'epoch': 0.67} +{'loss': 40.593, 'grad_norm': 374.5202331542969, 'learning_rate': 2.904174228675136e-05, 'epoch': 0.67} +{'loss': 40.4483, 'grad_norm': 352.500244140625, 'learning_rate': 2.9036297640653356e-05, 'epoch': 0.68} +{'loss': 41.0123, 'grad_norm': 368.6827392578125, 'learning_rate': 2.9030852994555355e-05, 'epoch': 0.68} +{'loss': 41.0098, 'grad_norm': 339.8343200683594, 'learning_rate': 2.902540834845735e-05, 'epoch': 0.68} +{'loss': 42.0471, 'grad_norm': 362.53424072265625, 'learning_rate': 2.901996370235935e-05, 'epoch': 0.69} + 4%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 200/5520 [10:10<4:08:31, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7149370908737183, 'eval_runtime': 3.1346, 'eval_samples_per_second': 57.104, 'eval_steps_per_second': 57.104, 'epoch': 0.69} +{'loss': 43.0053, 'grad_norm': 394.1274719238281, 'learning_rate': 2.9014519056261345e-05, 'epoch': 0.69} +{'loss': 42.6179, 'grad_norm': 370.6410217285156, 'learning_rate': 2.900907441016334e-05, 'epoch': 0.69} +{'loss': 42.4657, 'grad_norm': 396.1412048339844, 'learning_rate': 2.9003629764065335e-05, 'epoch': 0.7} +{'loss': 41.6011, 'grad_norm': 359.99468994140625, 'learning_rate': 2.8998185117967334e-05, 'epoch': 0.7} +{'loss': 42.8562, 'grad_norm': 436.6610107421875, 'learning_rate': 2.899274047186933e-05, 'epoch': 0.7} +{'loss': 35.0799, 'grad_norm': 472.45355224609375, 'learning_rate': 2.8987295825771325e-05, 'epoch': 0.71} +{'loss': 29.5268, 'grad_norm': 441.8983154296875, 'learning_rate': 2.898185117967332e-05, 'epoch': 0.71} +{'loss': 27.1006, 'grad_norm': 376.2563171386719, 'learning_rate': 2.8976406533575316e-05, 'epoch': 0.72} +{'loss': 27.4286, 'grad_norm': 345.8896789550781, 'learning_rate': 2.8970961887477318e-05, 'epoch': 0.72} +{'loss': 27.3932, 'grad_norm': 408.644775390625, 'learning_rate': 2.8965517241379313e-05, 'epoch': 0.72} + 4%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 200/5520 [10:13<4:08:31, 2.80s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 4%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 210/5520 [10:42<4:08:54, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7911182641983032, 'eval_runtime': 3.1317, 'eval_samples_per_second': 57.158, 'eval_steps_per_second': 57.158, 'epoch': 0.72} +{'loss': 58.3407, 'grad_norm': 1156.6982421875, 'learning_rate': 2.896007259528131e-05, 'epoch': 0.73} +{'loss': 58.1773, 'grad_norm': 1127.2872314453125, 'learning_rate': 2.8954627949183304e-05, 'epoch': 0.73} +{'loss': 57.3066, 'grad_norm': 974.721923828125, 'learning_rate': 2.89491833030853e-05, 'epoch': 0.73} +{'loss': 54.5647, 'grad_norm': 724.0964965820312, 'learning_rate': 2.8943738656987295e-05, 'epoch': 0.74} +{'loss': 51.5741, 'grad_norm': 577.144287109375, 'learning_rate': 2.8938294010889294e-05, 'epoch': 0.74} +{'loss': 49.4595, 'grad_norm': 406.2142028808594, 'learning_rate': 2.893284936479129e-05, 'epoch': 0.74} +{'loss': 50.7602, 'grad_norm': 537.4603271484375, 'learning_rate': 2.8927404718693284e-05, 'epoch': 0.75} +{'loss': 50.6034, 'grad_norm': 696.2557373046875, 'learning_rate': 2.892196007259528e-05, 'epoch': 0.75} +{'loss': 50.5617, 'grad_norm': 644.7799682617188, 'learning_rate': 2.891651542649728e-05, 'epoch': 0.75} +{'loss': 48.4847, 'grad_norm': 443.0915832519531, 'learning_rate': 2.8911070780399277e-05, 'epoch': 0.76} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 4%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 220/5520 [11:13<4:07:13, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7149282097816467, 'eval_runtime': 3.1344, 'eval_samples_per_second': 57.108, 'eval_steps_per_second': 57.108, 'epoch': 0.76} +{'loss': 46.2606, 'grad_norm': 359.4809875488281, 'learning_rate': 2.8905626134301273e-05, 'epoch': 0.76} +{'loss': 45.7595, 'grad_norm': 422.4323425292969, 'learning_rate': 2.8900181488203268e-05, 'epoch': 0.77} +{'loss': 45.5474, 'grad_norm': 374.7406311035156, 'learning_rate': 2.8894736842105263e-05, 'epoch': 0.77} +{'loss': 43.0967, 'grad_norm': 360.0633544921875, 'learning_rate': 2.888929219600726e-05, 'epoch': 0.77} +{'loss': 44.7585, 'grad_norm': 401.3516540527344, 'learning_rate': 2.8883847549909254e-05, 'epoch': 0.78} +{'loss': 44.1134, 'grad_norm': 461.3826904296875, 'learning_rate': 2.8878402903811253e-05, 'epoch': 0.78} +{'loss': 44.1363, 'grad_norm': 388.8529052734375, 'learning_rate': 2.887295825771325e-05, 'epoch': 0.78} +{'loss': 45.4802, 'grad_norm': 365.8173828125, 'learning_rate': 2.8867513611615247e-05, 'epoch': 0.79} +{'loss': 45.2052, 'grad_norm': 362.2828369140625, 'learning_rate': 2.8862068965517243e-05, 'epoch': 0.79} +{'loss': 46.6664, 'grad_norm': 387.8126220703125, 'learning_rate': 2.8856624319419238e-05, 'epoch': 0.79} + 4%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 230/5520 [11:44<4:07:15, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7011916637420654, 'eval_runtime': 3.1343, 'eval_samples_per_second': 57.11, 'eval_steps_per_second': 57.11, 'epoch': 0.79} +{'loss': 45.9061, 'grad_norm': 481.8575744628906, 'learning_rate': 2.8851179673321237e-05, 'epoch': 0.8} +{'loss': 46.1226, 'grad_norm': 403.699462890625, 'learning_rate': 2.8845735027223232e-05, 'epoch': 0.8} +{'loss': 47.5213, 'grad_norm': 389.87646484375, 'learning_rate': 2.8840290381125227e-05, 'epoch': 0.81} +{'loss': 46.5401, 'grad_norm': 351.58551025390625, 'learning_rate': 2.8834845735027223e-05, 'epoch': 0.81} +{'loss': 46.655, 'grad_norm': 419.92437744140625, 'learning_rate': 2.8829401088929218e-05, 'epoch': 0.81} +{'loss': 47.2712, 'grad_norm': 369.2700500488281, 'learning_rate': 2.8823956442831214e-05, 'epoch': 0.82} +{'loss': 45.873, 'grad_norm': 350.486328125, 'learning_rate': 2.8818511796733216e-05, 'epoch': 0.82} +{'loss': 45.5976, 'grad_norm': 370.6356201171875, 'learning_rate': 2.881306715063521e-05, 'epoch': 0.82} +{'loss': 45.4359, 'grad_norm': 388.7554931640625, 'learning_rate': 2.8807622504537207e-05, 'epoch': 0.83} +{'loss': 41.5546, 'grad_norm': 356.65447998046875, 'learning_rate': 2.8802177858439202e-05, 'epoch': 0.83} + 4%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 240/5520 [12:14<4:07:02, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6976248621940613, 'eval_runtime': 3.1394, 'eval_samples_per_second': 57.018, 'eval_steps_per_second': 57.018, 'epoch': 0.83} +{'loss': 40.3896, 'grad_norm': 484.9076232910156, 'learning_rate': 2.8796733212341197e-05, 'epoch': 0.83} +{'loss': 38.1999, 'grad_norm': 426.18902587890625, 'learning_rate': 2.8791288566243196e-05, 'epoch': 0.84} +{'loss': 38.8128, 'grad_norm': 387.5289001464844, 'learning_rate': 2.878584392014519e-05, 'epoch': 0.84} +{'loss': 39.1003, 'grad_norm': 491.71331787109375, 'learning_rate': 2.8780399274047187e-05, 'epoch': 0.85} +{'loss': 40.7458, 'grad_norm': 376.87249755859375, 'learning_rate': 2.8774954627949182e-05, 'epoch': 0.85} +{'loss': 41.9222, 'grad_norm': 459.1217041015625, 'learning_rate': 2.876950998185118e-05, 'epoch': 0.85} +{'loss': 41.0784, 'grad_norm': 445.1222229003906, 'learning_rate': 2.8764065335753176e-05, 'epoch': 0.86} +{'loss': 41.5524, 'grad_norm': 375.32843017578125, 'learning_rate': 2.8758620689655175e-05, 'epoch': 0.86} +{'loss': 41.5471, 'grad_norm': 303.4617614746094, 'learning_rate': 2.875317604355717e-05, 'epoch': 0.86} +{'loss': 40.503, 'grad_norm': 360.2012634277344, 'learning_rate': 2.8747731397459166e-05, 'epoch': 0.87} + 5%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 250/5520 [12:45<4:06:04, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.695322573184967, 'eval_runtime': 3.1341, 'eval_samples_per_second': 57.113, 'eval_steps_per_second': 57.113, 'epoch': 0.87} +{'loss': 41.8679, 'grad_norm': 384.7886047363281, 'learning_rate': 2.874228675136116e-05, 'epoch': 0.87} +{'loss': 42.4417, 'grad_norm': 344.9561767578125, 'learning_rate': 2.8736842105263157e-05, 'epoch': 0.87} +{'loss': 42.0715, 'grad_norm': 356.1025695800781, 'learning_rate': 2.8731397459165155e-05, 'epoch': 0.88} +{'loss': 42.4063, 'grad_norm': 416.7387390136719, 'learning_rate': 2.872595281306715e-05, 'epoch': 0.88} +{'loss': 41.3914, 'grad_norm': 392.5692138671875, 'learning_rate': 2.872050816696915e-05, 'epoch': 0.88} +{'loss': 34.0761, 'grad_norm': 405.326416015625, 'learning_rate': 2.8715063520871145e-05, 'epoch': 0.89} +{'loss': 28.2779, 'grad_norm': 484.799072265625, 'learning_rate': 2.870961887477314e-05, 'epoch': 0.89} +{'loss': 27.6529, 'grad_norm': 499.2939147949219, 'learning_rate': 2.8704174228675136e-05, 'epoch': 0.9} +{'loss': 27.4412, 'grad_norm': 381.8467102050781, 'learning_rate': 2.8698729582577135e-05, 'epoch': 0.9} +{'loss': 28.0706, 'grad_norm': 344.0008850097656, 'learning_rate': 2.869328493647913e-05, 'epoch': 0.9} + 5%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 260/5520 [13:16<4:06:19, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7842397093772888, 'eval_runtime': 3.1357, 'eval_samples_per_second': 57.085, 'eval_steps_per_second': 57.085, 'epoch': 0.9} +{'loss': 58.7628, 'grad_norm': 1059.8311767578125, 'learning_rate': 2.8687840290381125e-05, 'epoch': 0.91} +{'loss': 57.5323, 'grad_norm': 1057.7684326171875, 'learning_rate': 2.868239564428312e-05, 'epoch': 0.91} +{'loss': 55.8152, 'grad_norm': 976.0852661132812, 'learning_rate': 2.8676950998185116e-05, 'epoch': 0.91} +{'loss': 55.599, 'grad_norm': 860.575439453125, 'learning_rate': 2.8671506352087115e-05, 'epoch': 0.92} +{'loss': 52.4687, 'grad_norm': 615.1729736328125, 'learning_rate': 2.8666061705989114e-05, 'epoch': 0.92} +{'loss': 50.4135, 'grad_norm': 489.91754150390625, 'learning_rate': 2.866061705989111e-05, 'epoch': 0.92} +{'loss': 48.5034, 'grad_norm': 490.24029541015625, 'learning_rate': 2.8655172413793104e-05, 'epoch': 0.93} +{'loss': 47.2695, 'grad_norm': 396.28326416015625, 'learning_rate': 2.86497277676951e-05, 'epoch': 0.93} +{'loss': 46.0009, 'grad_norm': 382.5725402832031, 'learning_rate': 2.86442831215971e-05, 'epoch': 0.94} +{'loss': 45.5784, 'grad_norm': 403.9846496582031, 'learning_rate': 2.8638838475499094e-05, 'epoch': 0.94} + 5%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 270/5520 [13:47<4:04:49, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7167119979858398, 'eval_runtime': 3.147, 'eval_samples_per_second': 56.88, 'eval_steps_per_second': 56.88, 'epoch': 0.94} +{'loss': 46.9294, 'grad_norm': 501.81561279296875, 'learning_rate': 2.863339382940109e-05, 'epoch': 0.94} +{'loss': 47.5202, 'grad_norm': 500.6963806152344, 'learning_rate': 2.8627949183303085e-05, 'epoch': 0.95} +{'loss': 47.7158, 'grad_norm': 453.0813903808594, 'learning_rate': 2.862250453720508e-05, 'epoch': 0.95} +{'loss': 48.9962, 'grad_norm': 460.04742431640625, 'learning_rate': 2.861705989110708e-05, 'epoch': 0.95} +{'loss': 48.8704, 'grad_norm': 426.95196533203125, 'learning_rate': 2.8611615245009078e-05, 'epoch': 0.96} +{'loss': 43.871, 'grad_norm': 381.4711608886719, 'learning_rate': 2.8606170598911073e-05, 'epoch': 0.96} +{'loss': 38.4646, 'grad_norm': 333.3099060058594, 'learning_rate': 2.860072595281307e-05, 'epoch': 0.96} +{'loss': 37.1731, 'grad_norm': 325.5362548828125, 'learning_rate': 2.8595281306715064e-05, 'epoch': 0.97} +{'loss': 39.6756, 'grad_norm': 379.2328796386719, 'learning_rate': 2.858983666061706e-05, 'epoch': 0.97} +{'loss': 41.435, 'grad_norm': 407.74200439453125, 'learning_rate': 2.8584392014519058e-05, 'epoch': 0.98} + 5%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 280/5520 [14:17<4:03:05, 2.78s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6935378909111023, 'eval_runtime': 3.1372, 'eval_samples_per_second': 57.057, 'eval_steps_per_second': 57.057, 'epoch': 0.98} +{'loss': 41.3764, 'grad_norm': 432.80267333984375, 'learning_rate': 2.8578947368421053e-05, 'epoch': 0.98} +{'loss': 39.6562, 'grad_norm': 386.5149841308594, 'learning_rate': 2.857350272232305e-05, 'epoch': 0.98} +{'loss': 42.5142, 'grad_norm': 394.14471435546875, 'learning_rate': 2.8568058076225047e-05, 'epoch': 0.99} +{'loss': 41.819, 'grad_norm': 389.7673645019531, 'learning_rate': 2.8562613430127043e-05, 'epoch': 0.99} +{'loss': 34.3601, 'grad_norm': 473.9212951660156, 'learning_rate': 2.8557168784029038e-05, 'epoch': 0.99} +{'loss': 27.1479, 'grad_norm': 422.0166320800781, 'learning_rate': 2.8551724137931037e-05, 'epoch': 1.0} +{'loss': 23.7312, 'grad_norm': 287.4736633300781, 'learning_rate': 2.8546279491833032e-05, 'epoch': 1.0} +{'loss': 54.4935, 'grad_norm': 877.3681030273438, 'learning_rate': 2.8540834845735028e-05, 'epoch': 1.0} +{'loss': 52.8877, 'grad_norm': 739.6668090820312, 'learning_rate': 2.8535390199637023e-05, 'epoch': 1.01} +{'loss': 52.3691, 'grad_norm': 718.5248413085938, 'learning_rate': 2.852994555353902e-05, 'epoch': 1.01} + 5%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 290/5520 [14:48<4:03:53, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7196069359779358, 'eval_runtime': 3.139, 'eval_samples_per_second': 57.025, 'eval_steps_per_second': 57.025, 'epoch': 1.01} +{'loss': 49.2538, 'grad_norm': 532.3770141601562, 'learning_rate': 2.8524500907441017e-05, 'epoch': 1.01} +{'loss': 48.7469, 'grad_norm': 431.7366027832031, 'learning_rate': 2.8519056261343016e-05, 'epoch': 1.02} +{'loss': 47.96, 'grad_norm': 338.91424560546875, 'learning_rate': 2.851361161524501e-05, 'epoch': 1.02} +{'loss': 48.9088, 'grad_norm': 448.8798828125, 'learning_rate': 2.8508166969147007e-05, 'epoch': 1.03} +{'loss': 49.1375, 'grad_norm': 395.4872131347656, 'learning_rate': 2.8502722323049002e-05, 'epoch': 1.03} +{'loss': 49.393, 'grad_norm': 428.61285400390625, 'learning_rate': 2.8497277676950998e-05, 'epoch': 1.03} +{'loss': 49.0513, 'grad_norm': 380.1004943847656, 'learning_rate': 2.8491833030852996e-05, 'epoch': 1.04} +{'loss': 47.6806, 'grad_norm': 318.0881042480469, 'learning_rate': 2.8486388384754992e-05, 'epoch': 1.04} +{'loss': 45.9821, 'grad_norm': 331.2910461425781, 'learning_rate': 2.8480943738656987e-05, 'epoch': 1.04} +{'loss': 43.9498, 'grad_norm': 280.7160339355469, 'learning_rate': 2.8475499092558982e-05, 'epoch': 1.05} + 5%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 300/5520 [15:19<4:04:26, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.682730495929718, 'eval_runtime': 3.1351, 'eval_samples_per_second': 57.095, 'eval_steps_per_second': 57.095, 'epoch': 1.05} +{'loss': 42.5624, 'grad_norm': 246.1832733154297, 'learning_rate': 2.8470054446460978e-05, 'epoch': 1.05} +{'loss': 43.2049, 'grad_norm': 262.2304992675781, 'learning_rate': 2.846460980036298e-05, 'epoch': 1.05} +{'loss': 43.5165, 'grad_norm': 290.2947082519531, 'learning_rate': 2.8459165154264975e-05, 'epoch': 1.06} +{'loss': 43.5621, 'grad_norm': 269.8375244140625, 'learning_rate': 2.845372050816697e-05, 'epoch': 1.06} +{'loss': 42.9337, 'grad_norm': 275.5233459472656, 'learning_rate': 2.8448275862068966e-05, 'epoch': 1.07} +{'loss': 45.051, 'grad_norm': 275.6507873535156, 'learning_rate': 2.844283121597096e-05, 'epoch': 1.07} +{'loss': 44.6103, 'grad_norm': 340.93536376953125, 'learning_rate': 2.8437386569872957e-05, 'epoch': 1.07} +{'loss': 44.5428, 'grad_norm': 286.0844421386719, 'learning_rate': 2.8431941923774956e-05, 'epoch': 1.08} +{'loss': 45.631, 'grad_norm': 316.6739501953125, 'learning_rate': 2.842649727767695e-05, 'epoch': 1.08} +{'loss': 45.0464, 'grad_norm': 256.1273193359375, 'learning_rate': 2.8421052631578946e-05, 'epoch': 1.08} + 6%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 310/5520 [15:50<4:02:38, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6778246760368347, 'eval_runtime': 3.1349, 'eval_samples_per_second': 57.099, 'eval_steps_per_second': 57.099, 'epoch': 1.08} +{'loss': 45.8102, 'grad_norm': 281.78082275390625, 'learning_rate': 2.8415607985480945e-05, 'epoch': 1.09} +{'loss': 45.3131, 'grad_norm': 297.08770751953125, 'learning_rate': 2.841016333938294e-05, 'epoch': 1.09} +{'loss': 44.9113, 'grad_norm': 388.77972412109375, 'learning_rate': 2.840471869328494e-05, 'epoch': 1.09} +{'loss': 45.9125, 'grad_norm': 301.92913818359375, 'learning_rate': 2.8399274047186935e-05, 'epoch': 1.1} +{'loss': 45.7297, 'grad_norm': 387.6468505859375, 'learning_rate': 2.839382940108893e-05, 'epoch': 1.1} +{'loss': 45.2253, 'grad_norm': 315.0013427734375, 'learning_rate': 2.8388384754990926e-05, 'epoch': 1.1} +{'loss': 42.6807, 'grad_norm': 341.985595703125, 'learning_rate': 2.838294010889292e-05, 'epoch': 1.11} +{'loss': 39.3621, 'grad_norm': 390.09674072265625, 'learning_rate': 2.8377495462794916e-05, 'epoch': 1.11} +{'loss': 37.6168, 'grad_norm': 391.62640380859375, 'learning_rate': 2.8372050816696915e-05, 'epoch': 1.12} +{'loss': 38.7192, 'grad_norm': 353.9164123535156, 'learning_rate': 2.8366606170598914e-05, 'epoch': 1.12} + 6%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 320/5520 [16:21<4:02:54, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6953558325767517, 'eval_runtime': 3.1291, 'eval_samples_per_second': 57.205, 'eval_steps_per_second': 57.205, 'epoch': 1.12} +{'loss': 39.5022, 'grad_norm': 302.96240234375, 'learning_rate': 2.836116152450091e-05, 'epoch': 1.12} +{'loss': 39.6587, 'grad_norm': 400.8553466796875, 'learning_rate': 2.8355716878402905e-05, 'epoch': 1.13} +{'loss': 39.8058, 'grad_norm': 345.9519348144531, 'learning_rate': 2.83502722323049e-05, 'epoch': 1.13} +{'loss': 39.951, 'grad_norm': 337.1177978515625, 'learning_rate': 2.83448275862069e-05, 'epoch': 1.13} +{'loss': 39.309, 'grad_norm': 301.2976989746094, 'learning_rate': 2.8339382940108894e-05, 'epoch': 1.14} +{'loss': 40.6924, 'grad_norm': 406.03094482421875, 'learning_rate': 2.833393829401089e-05, 'epoch': 1.14} +{'loss': 41.3554, 'grad_norm': 390.6329345703125, 'learning_rate': 2.8328493647912885e-05, 'epoch': 1.14} +{'loss': 41.1766, 'grad_norm': 321.64508056640625, 'learning_rate': 2.832304900181488e-05, 'epoch': 1.15} +{'loss': 40.1808, 'grad_norm': 283.5152282714844, 'learning_rate': 2.831760435571688e-05, 'epoch': 1.15} +{'loss': 42.0895, 'grad_norm': 348.6308288574219, 'learning_rate': 2.8312159709618878e-05, 'epoch': 1.16} + 6%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 330/5520 [16:51<4:03:39, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.69289630651474, 'eval_runtime': 3.1367, 'eval_samples_per_second': 57.067, 'eval_steps_per_second': 57.067, 'epoch': 1.16} +{'loss': 41.5536, 'grad_norm': 316.7882995605469, 'learning_rate': 2.8306715063520873e-05, 'epoch': 1.16} +{'loss': 41.4083, 'grad_norm': 373.5389099121094, 'learning_rate': 2.830127041742287e-05, 'epoch': 1.16} +{'loss': 39.9028, 'grad_norm': 382.8615417480469, 'learning_rate': 2.8295825771324864e-05, 'epoch': 1.17} +{'loss': 28.8617, 'grad_norm': 327.3189392089844, 'learning_rate': 2.829038112522686e-05, 'epoch': 1.17} +{'loss': 27.1866, 'grad_norm': 307.2225036621094, 'learning_rate': 2.8284936479128858e-05, 'epoch': 1.17} +{'loss': 27.7946, 'grad_norm': 257.647705078125, 'learning_rate': 2.8279491833030854e-05, 'epoch': 1.18} +{'loss': 27.1481, 'grad_norm': 286.5907897949219, 'learning_rate': 2.827404718693285e-05, 'epoch': 1.18} +{'loss': 55.8025, 'grad_norm': 914.318603515625, 'learning_rate': 2.8268602540834848e-05, 'epoch': 1.18} +{'loss': 56.1987, 'grad_norm': 858.4988403320312, 'learning_rate': 2.8263157894736843e-05, 'epoch': 1.19} +{'loss': 54.3495, 'grad_norm': 800.506103515625, 'learning_rate': 2.825771324863884e-05, 'epoch': 1.19} + 6%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 340/5520 [17:22<4:01:43, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7448948621749878, 'eval_runtime': 3.1331, 'eval_samples_per_second': 57.132, 'eval_steps_per_second': 57.132, 'epoch': 1.19} +{'loss': 53.5803, 'grad_norm': 692.0379028320312, 'learning_rate': 2.8252268602540837e-05, 'epoch': 1.2} +{'loss': 50.6743, 'grad_norm': 527.4228515625, 'learning_rate': 2.8246823956442833e-05, 'epoch': 1.2} +{'loss': 49.3978, 'grad_norm': 460.74169921875, 'learning_rate': 2.8241379310344828e-05, 'epoch': 1.2} +{'loss': 49.7682, 'grad_norm': 388.31201171875, 'learning_rate': 2.8235934664246823e-05, 'epoch': 1.21} +{'loss': 48.4647, 'grad_norm': 414.94775390625, 'learning_rate': 2.823049001814882e-05, 'epoch': 1.21} +{'loss': 48.9792, 'grad_norm': 440.1581115722656, 'learning_rate': 2.8225045372050818e-05, 'epoch': 1.21} +{'loss': 48.0859, 'grad_norm': 377.634033203125, 'learning_rate': 2.8219600725952813e-05, 'epoch': 1.22} +{'loss': 46.2391, 'grad_norm': 286.99462890625, 'learning_rate': 2.8214156079854812e-05, 'epoch': 1.22} +{'loss': 45.4826, 'grad_norm': 353.834716796875, 'learning_rate': 2.8208711433756807e-05, 'epoch': 1.22} +{'loss': 43.7182, 'grad_norm': 311.1981506347656, 'learning_rate': 2.8203266787658802e-05, 'epoch': 1.23} + 6%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 350/5520 [17:53<4:02:26, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6925392150878906, 'eval_runtime': 3.1338, 'eval_samples_per_second': 57.119, 'eval_steps_per_second': 57.119, 'epoch': 1.23} +{'loss': 45.2841, 'grad_norm': 343.8255920410156, 'learning_rate': 2.8197822141560798e-05, 'epoch': 1.23} +{'loss': 43.4088, 'grad_norm': 275.9765930175781, 'learning_rate': 2.8192377495462797e-05, 'epoch': 1.23} +{'loss': 42.6711, 'grad_norm': 228.50440979003906, 'learning_rate': 2.8186932849364792e-05, 'epoch': 1.24} +{'loss': 43.0506, 'grad_norm': 253.25831604003906, 'learning_rate': 2.8181488203266787e-05, 'epoch': 1.24} +{'loss': 44.455, 'grad_norm': 243.9517059326172, 'learning_rate': 2.8176043557168783e-05, 'epoch': 1.25} +{'loss': 45.154, 'grad_norm': 245.95286560058594, 'learning_rate': 2.8170598911070778e-05, 'epoch': 1.25} +{'loss': 44.0179, 'grad_norm': 245.6503143310547, 'learning_rate': 2.816515426497278e-05, 'epoch': 1.25} +{'loss': 45.7594, 'grad_norm': 290.8607177734375, 'learning_rate': 2.8159709618874776e-05, 'epoch': 1.26} +{'loss': 44.4864, 'grad_norm': 259.909912109375, 'learning_rate': 2.815426497277677e-05, 'epoch': 1.26} +{'loss': 47.1445, 'grad_norm': 284.4267272949219, 'learning_rate': 2.8148820326678766e-05, 'epoch': 1.26} + 7%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 360/5520 [18:24<4:00:16, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6740585565567017, 'eval_runtime': 3.1352, 'eval_samples_per_second': 57.093, 'eval_steps_per_second': 57.093, 'epoch': 1.26} +{'loss': 46.3238, 'grad_norm': 362.87164306640625, 'learning_rate': 2.8143375680580762e-05, 'epoch': 1.27} +{'loss': 46.2261, 'grad_norm': 290.58477783203125, 'learning_rate': 2.813793103448276e-05, 'epoch': 1.27} +{'loss': 44.5556, 'grad_norm': 289.98101806640625, 'learning_rate': 2.8132486388384756e-05, 'epoch': 1.27} +{'loss': 45.2847, 'grad_norm': 318.88604736328125, 'learning_rate': 2.812704174228675e-05, 'epoch': 1.28} +{'loss': 44.1901, 'grad_norm': 318.89227294921875, 'learning_rate': 2.8121597096188747e-05, 'epoch': 1.28} +{'loss': 43.0926, 'grad_norm': 389.559814453125, 'learning_rate': 2.8116152450090746e-05, 'epoch': 1.29} +{'loss': 40.3614, 'grad_norm': 444.1388854980469, 'learning_rate': 2.811070780399274e-05, 'epoch': 1.29} +{'loss': 38.9221, 'grad_norm': 395.99993896484375, 'learning_rate': 2.810526315789474e-05, 'epoch': 1.29} +{'loss': 37.6041, 'grad_norm': 346.8638000488281, 'learning_rate': 2.8099818511796735e-05, 'epoch': 1.3} +{'loss': 38.9997, 'grad_norm': 255.75537109375, 'learning_rate': 2.809437386569873e-05, 'epoch': 1.3} + 7%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 370/5520 [18:55<4:01:18, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.689025342464447, 'eval_runtime': 3.132, 'eval_samples_per_second': 57.152, 'eval_steps_per_second': 57.152, 'epoch': 1.3} +{'loss': 38.4022, 'grad_norm': 337.3376770019531, 'learning_rate': 2.8088929219600726e-05, 'epoch': 1.3} +{'loss': 39.1616, 'grad_norm': 343.35205078125, 'learning_rate': 2.808348457350272e-05, 'epoch': 1.31} +{'loss': 41.1005, 'grad_norm': 396.1789245605469, 'learning_rate': 2.807803992740472e-05, 'epoch': 1.31} +{'loss': 40.1739, 'grad_norm': 358.3716735839844, 'learning_rate': 2.8072595281306715e-05, 'epoch': 1.31} +{'loss': 41.1481, 'grad_norm': 475.8331298828125, 'learning_rate': 2.8067150635208714e-05, 'epoch': 1.32} +{'loss': 41.1013, 'grad_norm': 322.4574279785156, 'learning_rate': 2.806170598911071e-05, 'epoch': 1.32} +{'loss': 41.1077, 'grad_norm': 386.1807861328125, 'learning_rate': 2.8056261343012705e-05, 'epoch': 1.33} +{'loss': 40.5549, 'grad_norm': 335.3432312011719, 'learning_rate': 2.80508166969147e-05, 'epoch': 1.33} +{'loss': 41.4764, 'grad_norm': 344.7771911621094, 'learning_rate': 2.80453720508167e-05, 'epoch': 1.33} +{'loss': 42.3345, 'grad_norm': 373.671142578125, 'learning_rate': 2.8039927404718694e-05, 'epoch': 1.34} + 7%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 380/5520 [19:26<4:01:07, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6789068579673767, 'eval_runtime': 3.1345, 'eval_samples_per_second': 57.106, 'eval_steps_per_second': 57.106, 'epoch': 1.34} +{'loss': 41.2196, 'grad_norm': 374.783203125, 'learning_rate': 2.803448275862069e-05, 'epoch': 1.34} +{'loss': 41.4368, 'grad_norm': 391.8028564453125, 'learning_rate': 2.8029038112522685e-05, 'epoch': 1.34} +{'loss': 38.4212, 'grad_norm': 364.7682800292969, 'learning_rate': 2.802359346642468e-05, 'epoch': 1.35} +{'loss': 26.7865, 'grad_norm': 335.779541015625, 'learning_rate': 2.8018148820326683e-05, 'epoch': 1.35} +{'loss': 25.3621, 'grad_norm': 353.480224609375, 'learning_rate': 2.8012704174228678e-05, 'epoch': 1.35} +{'loss': 26.8962, 'grad_norm': 246.8798370361328, 'learning_rate': 2.8007259528130674e-05, 'epoch': 1.36} +{'loss': 27.0153, 'grad_norm': 384.77801513671875, 'learning_rate': 2.800181488203267e-05, 'epoch': 1.36} +{'loss': 53.2037, 'grad_norm': 781.5183715820312, 'learning_rate': 2.7996370235934664e-05, 'epoch': 1.36} +{'loss': 55.7635, 'grad_norm': 765.4360961914062, 'learning_rate': 2.799092558983666e-05, 'epoch': 1.37} +{'loss': 52.802, 'grad_norm': 725.854736328125, 'learning_rate': 2.798548094373866e-05, 'epoch': 1.37} + 7%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 390/5520 [19:56<3:59:44, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7313510179519653, 'eval_runtime': 3.1334, 'eval_samples_per_second': 57.126, 'eval_steps_per_second': 57.126, 'epoch': 1.37} +{'loss': 51.6548, 'grad_norm': 564.2916259765625, 'learning_rate': 2.7980036297640654e-05, 'epoch': 1.38} +{'loss': 48.4082, 'grad_norm': 459.6091003417969, 'learning_rate': 2.797459165154265e-05, 'epoch': 1.38} +{'loss': 48.5173, 'grad_norm': 373.1909484863281, 'learning_rate': 2.7969147005444645e-05, 'epoch': 1.38} +{'loss': 47.5063, 'grad_norm': 371.30169677734375, 'learning_rate': 2.7963702359346643e-05, 'epoch': 1.39} +{'loss': 48.371, 'grad_norm': 336.7066345214844, 'learning_rate': 2.7958257713248642e-05, 'epoch': 1.39} +{'loss': 46.209, 'grad_norm': 338.871826171875, 'learning_rate': 2.7952813067150638e-05, 'epoch': 1.39} +{'loss': 48.0522, 'grad_norm': 404.99749755859375, 'learning_rate': 2.7947368421052633e-05, 'epoch': 1.4} +{'loss': 46.1458, 'grad_norm': 374.24017333984375, 'learning_rate': 2.7941923774954628e-05, 'epoch': 1.4} +{'loss': 44.5361, 'grad_norm': 269.91937255859375, 'learning_rate': 2.7936479128856624e-05, 'epoch': 1.4} +{'loss': 44.2957, 'grad_norm': 340.3489074707031, 'learning_rate': 2.793103448275862e-05, 'epoch': 1.41} + 7%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 400/5520 [20:27<3:59:52, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6879153251647949, 'eval_runtime': 3.136, 'eval_samples_per_second': 57.079, 'eval_steps_per_second': 57.079, 'epoch': 1.41} +{'loss': 43.126, 'grad_norm': 275.49676513671875, 'learning_rate': 2.7925589836660618e-05, 'epoch': 1.41} +{'loss': 43.3793, 'grad_norm': 241.9796905517578, 'learning_rate': 2.7920145190562613e-05, 'epoch': 1.42} +{'loss': 42.0434, 'grad_norm': 274.9486389160156, 'learning_rate': 2.7914700544464612e-05, 'epoch': 1.42} +{'loss': 43.9504, 'grad_norm': 259.0799255371094, 'learning_rate': 2.7909255898366607e-05, 'epoch': 1.42} +{'loss': 43.505, 'grad_norm': 311.82464599609375, 'learning_rate': 2.7903811252268603e-05, 'epoch': 1.43} +{'loss': 44.5498, 'grad_norm': 301.56243896484375, 'learning_rate': 2.78983666061706e-05, 'epoch': 1.43} +{'loss': 44.5173, 'grad_norm': 251.64212036132812, 'learning_rate': 2.7892921960072597e-05, 'epoch': 1.43} +{'loss': 45.396, 'grad_norm': 294.3619384765625, 'learning_rate': 2.7887477313974592e-05, 'epoch': 1.44} +{'loss': 43.6358, 'grad_norm': 273.31427001953125, 'learning_rate': 2.7882032667876588e-05, 'epoch': 1.44} +{'loss': 45.2258, 'grad_norm': 317.6174011230469, 'learning_rate': 2.7876588021778583e-05, 'epoch': 1.44} + 7%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 400/5520 [20:30<3:59:52, 2.81s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 7%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 410/5520 [20:59<4:00:14, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6741424202919006, 'eval_runtime': 3.1349, 'eval_samples_per_second': 57.1, 'eval_steps_per_second': 57.1, 'epoch': 1.44} +{'loss': 44.0452, 'grad_norm': 267.40118408203125, 'learning_rate': 2.787114337568058e-05, 'epoch': 1.45} +{'loss': 44.8225, 'grad_norm': 243.23074340820312, 'learning_rate': 2.786569872958258e-05, 'epoch': 1.45} +{'loss': 46.3814, 'grad_norm': 313.2763366699219, 'learning_rate': 2.7860254083484576e-05, 'epoch': 1.46} +{'loss': 44.5303, 'grad_norm': 348.0602722167969, 'learning_rate': 2.785480943738657e-05, 'epoch': 1.46} +{'loss': 46.2257, 'grad_norm': 307.08819580078125, 'learning_rate': 2.7849364791288567e-05, 'epoch': 1.46} +{'loss': 42.795, 'grad_norm': 283.5260925292969, 'learning_rate': 2.7843920145190562e-05, 'epoch': 1.47} +{'loss': 40.3193, 'grad_norm': 470.912841796875, 'learning_rate': 2.783847549909256e-05, 'epoch': 1.47} +{'loss': 39.166, 'grad_norm': 499.6931457519531, 'learning_rate': 2.7833030852994556e-05, 'epoch': 1.47} +{'loss': 38.0724, 'grad_norm': 440.8569641113281, 'learning_rate': 2.782758620689655e-05, 'epoch': 1.48} +{'loss': 38.5902, 'grad_norm': 307.85919189453125, 'learning_rate': 2.7822141560798547e-05, 'epoch': 1.48} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 8%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 420/5520 [21:30<3:58:23, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6782167553901672, 'eval_runtime': 3.1368, 'eval_samples_per_second': 57.065, 'eval_steps_per_second': 57.065, 'epoch': 1.48} +{'loss': 38.9796, 'grad_norm': 300.9029846191406, 'learning_rate': 2.7816696914700546e-05, 'epoch': 1.48} +{'loss': 38.3992, 'grad_norm': 406.2210693359375, 'learning_rate': 2.781125226860254e-05, 'epoch': 1.49} +{'loss': 39.0204, 'grad_norm': 374.5141906738281, 'learning_rate': 2.780580762250454e-05, 'epoch': 1.49} +{'loss': 40.1299, 'grad_norm': 437.4369201660156, 'learning_rate': 2.7800362976406535e-05, 'epoch': 1.49} +{'loss': 40.1278, 'grad_norm': 272.6376953125, 'learning_rate': 2.779491833030853e-05, 'epoch': 1.5} +{'loss': 39.6137, 'grad_norm': 320.0819091796875, 'learning_rate': 2.7789473684210526e-05, 'epoch': 1.5} +{'loss': 41.0757, 'grad_norm': 351.5314025878906, 'learning_rate': 2.778402903811252e-05, 'epoch': 1.51} +{'loss': 40.3751, 'grad_norm': 331.9538879394531, 'learning_rate': 2.777858439201452e-05, 'epoch': 1.51} +{'loss': 41.3858, 'grad_norm': 339.1962585449219, 'learning_rate': 2.7773139745916516e-05, 'epoch': 1.51} +{'loss': 42.1872, 'grad_norm': 264.5666198730469, 'learning_rate': 2.776769509981851e-05, 'epoch': 1.52} + 8%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 430/5520 [22:01<3:58:12, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6758362650871277, 'eval_runtime': 3.1397, 'eval_samples_per_second': 57.012, 'eval_steps_per_second': 57.012, 'epoch': 1.52} +{'loss': 41.5158, 'grad_norm': 282.2214050292969, 'learning_rate': 2.776225045372051e-05, 'epoch': 1.52} +{'loss': 39.6937, 'grad_norm': 314.0169677734375, 'learning_rate': 2.7756805807622505e-05, 'epoch': 1.52} +{'loss': 30.1697, 'grad_norm': 258.1871337890625, 'learning_rate': 2.77513611615245e-05, 'epoch': 1.53} +{'loss': 26.0674, 'grad_norm': 213.88528442382812, 'learning_rate': 2.77459165154265e-05, 'epoch': 1.53} +{'loss': 26.2021, 'grad_norm': 313.9029235839844, 'learning_rate': 2.7740471869328495e-05, 'epoch': 1.53} +{'loss': 26.9734, 'grad_norm': 334.663330078125, 'learning_rate': 2.773502722323049e-05, 'epoch': 1.54} +{'loss': 27.5513, 'grad_norm': 304.77117919921875, 'learning_rate': 2.7729582577132486e-05, 'epoch': 1.54} +{'loss': 52.8855, 'grad_norm': 642.5489501953125, 'learning_rate': 2.772413793103448e-05, 'epoch': 1.55} +{'loss': 53.095, 'grad_norm': 579.0210571289062, 'learning_rate': 2.771869328493648e-05, 'epoch': 1.55} +{'loss': 52.0631, 'grad_norm': 502.8334045410156, 'learning_rate': 2.771324863883848e-05, 'epoch': 1.55} + 8%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 440/5520 [22:32<3:58:38, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.70591801404953, 'eval_runtime': 3.1403, 'eval_samples_per_second': 57.001, 'eval_steps_per_second': 57.001, 'epoch': 1.55} +{'loss': 49.6795, 'grad_norm': 452.4619140625, 'learning_rate': 2.7707803992740474e-05, 'epoch': 1.56} +{'loss': 50.089, 'grad_norm': 434.84326171875, 'learning_rate': 2.770235934664247e-05, 'epoch': 1.56} +{'loss': 48.5218, 'grad_norm': 389.4812927246094, 'learning_rate': 2.7696914700544465e-05, 'epoch': 1.56} +{'loss': 48.0049, 'grad_norm': 279.72027587890625, 'learning_rate': 2.769147005444646e-05, 'epoch': 1.57} +{'loss': 47.7967, 'grad_norm': 294.6167907714844, 'learning_rate': 2.768602540834846e-05, 'epoch': 1.57} +{'loss': 48.3725, 'grad_norm': 296.6061706542969, 'learning_rate': 2.7680580762250454e-05, 'epoch': 1.57} +{'loss': 47.3637, 'grad_norm': 448.601318359375, 'learning_rate': 2.767513611615245e-05, 'epoch': 1.58} +{'loss': 45.8428, 'grad_norm': 401.6792297363281, 'learning_rate': 2.7669691470054445e-05, 'epoch': 1.58} +{'loss': 45.0625, 'grad_norm': 383.7574768066406, 'learning_rate': 2.7664246823956444e-05, 'epoch': 1.59} +{'loss': 45.0018, 'grad_norm': 354.9222412109375, 'learning_rate': 2.7658802177858442e-05, 'epoch': 1.59} + 8%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 450/5520 [23:02<3:57:56, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6869362592697144, 'eval_runtime': 3.1396, 'eval_samples_per_second': 57.013, 'eval_steps_per_second': 57.013, 'epoch': 1.59} +{'loss': 42.2533, 'grad_norm': 332.02191162109375, 'learning_rate': 2.7653357531760438e-05, 'epoch': 1.59} +{'loss': 43.0666, 'grad_norm': 214.40272521972656, 'learning_rate': 2.7647912885662433e-05, 'epoch': 1.6} +{'loss': 43.7141, 'grad_norm': 234.03184509277344, 'learning_rate': 2.764246823956443e-05, 'epoch': 1.6} +{'loss': 43.1818, 'grad_norm': 290.4942626953125, 'learning_rate': 2.7637023593466424e-05, 'epoch': 1.6} +{'loss': 44.992, 'grad_norm': 368.3863525390625, 'learning_rate': 2.7631578947368423e-05, 'epoch': 1.61} +{'loss': 43.9973, 'grad_norm': 256.7243347167969, 'learning_rate': 2.7626134301270418e-05, 'epoch': 1.61} +{'loss': 44.1467, 'grad_norm': 235.7418670654297, 'learning_rate': 2.7620689655172413e-05, 'epoch': 1.61} +{'loss': 44.3638, 'grad_norm': 269.3458251953125, 'learning_rate': 2.7615245009074412e-05, 'epoch': 1.62} +{'loss': 45.5499, 'grad_norm': 267.63104248046875, 'learning_rate': 2.7609800362976408e-05, 'epoch': 1.62} +{'loss': 44.6896, 'grad_norm': 266.48260498046875, 'learning_rate': 2.7604355716878403e-05, 'epoch': 1.62} + 8%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 460/5520 [23:33<3:56:18, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6687367558479309, 'eval_runtime': 3.138, 'eval_samples_per_second': 57.042, 'eval_steps_per_second': 57.042, 'epoch': 1.62} +{'loss': 44.4839, 'grad_norm': 280.531005859375, 'learning_rate': 2.7598911070780402e-05, 'epoch': 1.63} +{'loss': 44.4457, 'grad_norm': 277.5115966796875, 'learning_rate': 2.7593466424682397e-05, 'epoch': 1.63} +{'loss': 45.3896, 'grad_norm': 283.0730285644531, 'learning_rate': 2.7588021778584393e-05, 'epoch': 1.64} +{'loss': 45.1627, 'grad_norm': 220.58546447753906, 'learning_rate': 2.7582577132486388e-05, 'epoch': 1.64} +{'loss': 44.0173, 'grad_norm': 221.82968139648438, 'learning_rate': 2.7577132486388383e-05, 'epoch': 1.64} +{'loss': 41.7427, 'grad_norm': 293.05828857421875, 'learning_rate': 2.7571687840290382e-05, 'epoch': 1.65} +{'loss': 38.7822, 'grad_norm': 308.2817077636719, 'learning_rate': 2.756624319419238e-05, 'epoch': 1.65} +{'loss': 37.8994, 'grad_norm': 322.1114196777344, 'learning_rate': 2.7560798548094376e-05, 'epoch': 1.65} +{'loss': 38.2092, 'grad_norm': 357.4956359863281, 'learning_rate': 2.755535390199637e-05, 'epoch': 1.66} +{'loss': 39.1363, 'grad_norm': 298.619384765625, 'learning_rate': 2.7549909255898367e-05, 'epoch': 1.66} + 9%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 470/5520 [24:04<3:56:04, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6787883639335632, 'eval_runtime': 3.1331, 'eval_samples_per_second': 57.131, 'eval_steps_per_second': 57.131, 'epoch': 1.66} +{'loss': 37.5096, 'grad_norm': 353.0351867675781, 'learning_rate': 2.7544464609800362e-05, 'epoch': 1.66} +{'loss': 39.3225, 'grad_norm': 344.4702453613281, 'learning_rate': 2.753901996370236e-05, 'epoch': 1.67} +{'loss': 39.4745, 'grad_norm': 349.8557434082031, 'learning_rate': 2.7533575317604357e-05, 'epoch': 1.67} +{'loss': 39.7513, 'grad_norm': 285.15765380859375, 'learning_rate': 2.7528130671506352e-05, 'epoch': 1.68} +{'loss': 40.441, 'grad_norm': 329.09149169921875, 'learning_rate': 2.7522686025408347e-05, 'epoch': 1.68} +{'loss': 40.0033, 'grad_norm': 246.67437744140625, 'learning_rate': 2.7517241379310343e-05, 'epoch': 1.68} +{'loss': 41.4558, 'grad_norm': 299.9590148925781, 'learning_rate': 2.7511796733212345e-05, 'epoch': 1.69} +{'loss': 40.8088, 'grad_norm': 315.5220642089844, 'learning_rate': 2.750635208711434e-05, 'epoch': 1.69} +{'loss': 40.4457, 'grad_norm': 256.2172546386719, 'learning_rate': 2.7500907441016336e-05, 'epoch': 1.69} +{'loss': 42.0739, 'grad_norm': 345.38983154296875, 'learning_rate': 2.749546279491833e-05, 'epoch': 1.7} + 9%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 480/5520 [24:35<3:55:36, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6835405826568604, 'eval_runtime': 3.1373, 'eval_samples_per_second': 57.056, 'eval_steps_per_second': 57.056, 'epoch': 1.7} +{'loss': 41.6554, 'grad_norm': 425.0630187988281, 'learning_rate': 2.7490018148820326e-05, 'epoch': 1.7} +{'loss': 39.4656, 'grad_norm': 286.5938720703125, 'learning_rate': 2.7484573502722322e-05, 'epoch': 1.7} +{'loss': 26.6268, 'grad_norm': 356.6265869140625, 'learning_rate': 2.747912885662432e-05, 'epoch': 1.71} +{'loss': 26.344, 'grad_norm': 319.0960388183594, 'learning_rate': 2.7473684210526316e-05, 'epoch': 1.71} +{'loss': 27.099, 'grad_norm': 217.50375366210938, 'learning_rate': 2.746823956442831e-05, 'epoch': 1.72} +{'loss': 27.293, 'grad_norm': 199.71047973632812, 'learning_rate': 2.746279491833031e-05, 'epoch': 1.72} +{'loss': 26.5473, 'grad_norm': 222.1556396484375, 'learning_rate': 2.7457350272232305e-05, 'epoch': 1.72} +{'loss': 53.8046, 'grad_norm': 740.6441650390625, 'learning_rate': 2.7451905626134304e-05, 'epoch': 1.73} +{'loss': 54.1166, 'grad_norm': 792.884765625, 'learning_rate': 2.74464609800363e-05, 'epoch': 1.73} +{'loss': 51.6648, 'grad_norm': 613.0400390625, 'learning_rate': 2.7441016333938295e-05, 'epoch': 1.73} + 9%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 490/5520 [25:06<3:54:55, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.704450786113739, 'eval_runtime': 3.1367, 'eval_samples_per_second': 57.067, 'eval_steps_per_second': 57.067, 'epoch': 1.73} +{'loss': 49.7201, 'grad_norm': 469.0580139160156, 'learning_rate': 2.743557168784029e-05, 'epoch': 1.74} +{'loss': 48.8945, 'grad_norm': 347.80206298828125, 'learning_rate': 2.7430127041742286e-05, 'epoch': 1.74} +{'loss': 48.1509, 'grad_norm': 334.3109436035156, 'learning_rate': 2.742468239564428e-05, 'epoch': 1.74} +{'loss': 47.801, 'grad_norm': 276.7007141113281, 'learning_rate': 2.741923774954628e-05, 'epoch': 1.75} +{'loss': 47.9838, 'grad_norm': 322.46575927734375, 'learning_rate': 2.741379310344828e-05, 'epoch': 1.75} +{'loss': 46.9847, 'grad_norm': 320.534912109375, 'learning_rate': 2.7408348457350274e-05, 'epoch': 1.75} +{'loss': 47.8941, 'grad_norm': 415.94580078125, 'learning_rate': 2.740290381125227e-05, 'epoch': 1.76} +{'loss': 46.5792, 'grad_norm': 299.8996276855469, 'learning_rate': 2.7397459165154265e-05, 'epoch': 1.76} +{'loss': 43.625, 'grad_norm': 275.3497314453125, 'learning_rate': 2.7392014519056264e-05, 'epoch': 1.77} +{'loss': 42.5925, 'grad_norm': 281.14251708984375, 'learning_rate': 2.738656987295826e-05, 'epoch': 1.77} + 9%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 500/5520 [25:37<3:55:35, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6785204410552979, 'eval_runtime': 3.132, 'eval_samples_per_second': 57.152, 'eval_steps_per_second': 57.152, 'epoch': 1.77} +{'loss': 43.7302, 'grad_norm': 355.0955505371094, 'learning_rate': 2.7381125226860254e-05, 'epoch': 1.77} +{'loss': 42.6815, 'grad_norm': 302.5424499511719, 'learning_rate': 2.737568058076225e-05, 'epoch': 1.78} +{'loss': 43.3076, 'grad_norm': 288.3701171875, 'learning_rate': 2.7370235934664245e-05, 'epoch': 1.78} +{'loss': 43.5499, 'grad_norm': 288.28863525390625, 'learning_rate': 2.7364791288566244e-05, 'epoch': 1.78} +{'loss': 45.5163, 'grad_norm': 277.82171630859375, 'learning_rate': 2.7359346642468243e-05, 'epoch': 1.79} +{'loss': 43.8984, 'grad_norm': 240.2311248779297, 'learning_rate': 2.7353901996370238e-05, 'epoch': 1.79} +{'loss': 44.7699, 'grad_norm': 280.1030578613281, 'learning_rate': 2.7348457350272233e-05, 'epoch': 1.79} +{'loss': 44.2966, 'grad_norm': 260.6531982421875, 'learning_rate': 2.734301270417423e-05, 'epoch': 1.8} +{'loss': 44.8812, 'grad_norm': 284.82989501953125, 'learning_rate': 2.7337568058076224e-05, 'epoch': 1.8} +{'loss': 45.3101, 'grad_norm': 228.4029541015625, 'learning_rate': 2.7332123411978223e-05, 'epoch': 1.81} + 9%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 510/5520 [26:08<3:55:10, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6614294648170471, 'eval_runtime': 3.1354, 'eval_samples_per_second': 57.091, 'eval_steps_per_second': 57.091, 'epoch': 1.81} +{'loss': 44.7325, 'grad_norm': 253.9024200439453, 'learning_rate': 2.732667876588022e-05, 'epoch': 1.81} +{'loss': 44.5544, 'grad_norm': 234.1785888671875, 'learning_rate': 2.7321234119782214e-05, 'epoch': 1.81} +{'loss': 45.0984, 'grad_norm': 252.2194061279297, 'learning_rate': 2.7315789473684213e-05, 'epoch': 1.82} +{'loss': 44.5591, 'grad_norm': 244.02610778808594, 'learning_rate': 2.7310344827586208e-05, 'epoch': 1.82} +{'loss': 43.7073, 'grad_norm': 252.48089599609375, 'learning_rate': 2.7304900181488203e-05, 'epoch': 1.82} +{'loss': 40.7267, 'grad_norm': 258.9751892089844, 'learning_rate': 2.7299455535390202e-05, 'epoch': 1.83} +{'loss': 39.0883, 'grad_norm': 322.91387939453125, 'learning_rate': 2.7294010889292197e-05, 'epoch': 1.83} +{'loss': 37.8859, 'grad_norm': 392.3733215332031, 'learning_rate': 2.7288566243194193e-05, 'epoch': 1.83} +{'loss': 37.6328, 'grad_norm': 330.35089111328125, 'learning_rate': 2.7283121597096188e-05, 'epoch': 1.84} +{'loss': 38.4354, 'grad_norm': 306.2722473144531, 'learning_rate': 2.7277676950998184e-05, 'epoch': 1.84} + 9%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 520/5520 [26:39<3:54:19, 2.81s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6802475452423096, 'eval_runtime': 3.1337, 'eval_samples_per_second': 57.12, 'eval_steps_per_second': 57.12, 'epoch': 1.84} +{'loss': 37.5668, 'grad_norm': 376.08319091796875, 'learning_rate': 2.7272232304900182e-05, 'epoch': 1.85} +{'loss': 39.0387, 'grad_norm': 245.11607360839844, 'learning_rate': 2.7266787658802178e-05, 'epoch': 1.85} +{'loss': 39.8013, 'grad_norm': 326.08740234375, 'learning_rate': 2.7261343012704177e-05, 'epoch': 1.85} +{'loss': 39.1249, 'grad_norm': 286.0172119140625, 'learning_rate': 2.7255898366606172e-05, 'epoch': 1.86} +{'loss': 38.9208, 'grad_norm': 279.9872741699219, 'learning_rate': 2.7250453720508167e-05, 'epoch': 1.86} +{'loss': 39.6188, 'grad_norm': 273.5589904785156, 'learning_rate': 2.7245009074410163e-05, 'epoch': 1.86} +{'loss': 39.9636, 'grad_norm': 241.08322143554688, 'learning_rate': 2.723956442831216e-05, 'epoch': 1.87} +{'loss': 39.7664, 'grad_norm': 282.7255554199219, 'learning_rate': 2.7234119782214157e-05, 'epoch': 1.87} +{'loss': 40.4444, 'grad_norm': 276.45819091796875, 'learning_rate': 2.7228675136116152e-05, 'epoch': 1.87} +{'loss': 41.3736, 'grad_norm': 274.9344787597656, 'learning_rate': 2.7223230490018148e-05, 'epoch': 1.88} + 10%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 530/5520 [27:10<3:56:07, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6779935956001282, 'eval_runtime': 3.1278, 'eval_samples_per_second': 57.228, 'eval_steps_per_second': 57.228, 'epoch': 1.88} +{'loss': 41.0723, 'grad_norm': 251.0371551513672, 'learning_rate': 2.7217785843920143e-05, 'epoch': 1.88} +{'loss': 41.552, 'grad_norm': 313.0828552246094, 'learning_rate': 2.7212341197822145e-05, 'epoch': 1.88} +{'loss': 41.1185, 'grad_norm': 246.2321319580078, 'learning_rate': 2.720689655172414e-05, 'epoch': 1.89} +{'loss': 26.9467, 'grad_norm': 243.4658660888672, 'learning_rate': 2.7201451905626136e-05, 'epoch': 1.89} +{'loss': 26.1988, 'grad_norm': 234.8782196044922, 'learning_rate': 2.719600725952813e-05, 'epoch': 1.9} +{'loss': 26.4887, 'grad_norm': 218.89500427246094, 'learning_rate': 2.7190562613430127e-05, 'epoch': 1.9} +{'loss': 26.9755, 'grad_norm': 241.71937561035156, 'learning_rate': 2.7185117967332122e-05, 'epoch': 1.9} +{'loss': 52.2138, 'grad_norm': 532.9345092773438, 'learning_rate': 2.717967332123412e-05, 'epoch': 1.91} +{'loss': 51.3975, 'grad_norm': 600.501220703125, 'learning_rate': 2.7174228675136116e-05, 'epoch': 1.91} +{'loss': 51.2848, 'grad_norm': 570.4301147460938, 'learning_rate': 2.716878402903811e-05, 'epoch': 1.91} + 10%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 540/5520 [27:41<3:55:49, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.7027958035469055, 'eval_runtime': 3.1319, 'eval_samples_per_second': 57.153, 'eval_steps_per_second': 57.153, 'epoch': 1.91} +{'loss': 49.1175, 'grad_norm': 406.2899475097656, 'learning_rate': 2.716333938294011e-05, 'epoch': 1.92} +{'loss': 47.7343, 'grad_norm': 369.8658447265625, 'learning_rate': 2.7157894736842106e-05, 'epoch': 1.92} +{'loss': 48.1818, 'grad_norm': 338.6764831542969, 'learning_rate': 2.7152450090744105e-05, 'epoch': 1.92} +{'loss': 45.686, 'grad_norm': 283.4834899902344, 'learning_rate': 2.71470054446461e-05, 'epoch': 1.93} +{'loss': 44.3277, 'grad_norm': 327.53472900390625, 'learning_rate': 2.7141560798548095e-05, 'epoch': 1.93} +{'loss': 44.4469, 'grad_norm': 329.0078125, 'learning_rate': 2.713611615245009e-05, 'epoch': 1.94} +{'loss': 43.9265, 'grad_norm': 270.3822021484375, 'learning_rate': 2.7130671506352086e-05, 'epoch': 1.94} +{'loss': 44.3072, 'grad_norm': 224.95742797851562, 'learning_rate': 2.7125226860254085e-05, 'epoch': 1.94} +{'loss': 43.8803, 'grad_norm': 240.5491943359375, 'learning_rate': 2.711978221415608e-05, 'epoch': 1.95} +{'loss': 46.5793, 'grad_norm': 284.5292663574219, 'learning_rate': 2.711433756805808e-05, 'epoch': 1.95} + 10%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 550/5520 [28:12<3:56:01, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6684675812721252, 'eval_runtime': 3.1311, 'eval_samples_per_second': 57.168, 'eval_steps_per_second': 57.168, 'epoch': 1.95} +{'loss': 45.9066, 'grad_norm': 281.6688537597656, 'learning_rate': 2.7108892921960074e-05, 'epoch': 1.95} +{'loss': 45.677, 'grad_norm': 283.83514404296875, 'learning_rate': 2.710344827586207e-05, 'epoch': 1.96} +{'loss': 40.1636, 'grad_norm': 227.1653289794922, 'learning_rate': 2.7098003629764065e-05, 'epoch': 1.96} +{'loss': 37.5158, 'grad_norm': 294.7088928222656, 'learning_rate': 2.7092558983666064e-05, 'epoch': 1.96} +{'loss': 36.7221, 'grad_norm': 225.4500732421875, 'learning_rate': 2.708711433756806e-05, 'epoch': 1.97} +{'loss': 39.5524, 'grad_norm': 244.72509765625, 'learning_rate': 2.7081669691470055e-05, 'epoch': 1.97} +{'loss': 38.6021, 'grad_norm': 232.00390625, 'learning_rate': 2.707622504537205e-05, 'epoch': 1.98} +{'loss': 39.9881, 'grad_norm': 241.02322387695312, 'learning_rate': 2.7070780399274045e-05, 'epoch': 1.98} +{'loss': 40.5002, 'grad_norm': 244.6790771484375, 'learning_rate': 2.7065335753176044e-05, 'epoch': 1.98} +{'loss': 41.5041, 'grad_norm': 336.2419128417969, 'learning_rate': 2.7059891107078043e-05, 'epoch': 1.99} + 10%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 560/5520 [28:43<3:53:23, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.68587726354599, 'eval_runtime': 3.1319, 'eval_samples_per_second': 57.153, 'eval_steps_per_second': 57.153, 'epoch': 1.99} +{'loss': 41.0936, 'grad_norm': 283.76629638671875, 'learning_rate': 2.705444646098004e-05, 'epoch': 1.99} +{'loss': 30.5475, 'grad_norm': 236.99427795410156, 'learning_rate': 2.7049001814882034e-05, 'epoch': 1.99} +{'loss': 25.829, 'grad_norm': 247.35618591308594, 'learning_rate': 2.704355716878403e-05, 'epoch': 2.0} +{'loss': 24.0575, 'grad_norm': 206.315185546875, 'learning_rate': 2.7038112522686025e-05, 'epoch': 2.0} +{'loss': 50.3468, 'grad_norm': 499.1221923828125, 'learning_rate': 2.7032667876588023e-05, 'epoch': 2.0} +{'loss': 50.1256, 'grad_norm': 415.1005859375, 'learning_rate': 2.702722323049002e-05, 'epoch': 2.01} +{'loss': 50.4985, 'grad_norm': 414.549072265625, 'learning_rate': 2.7021778584392014e-05, 'epoch': 2.01} +{'loss': 48.4158, 'grad_norm': 339.5193786621094, 'learning_rate': 2.701633393829401e-05, 'epoch': 2.01} +{'loss': 48.3497, 'grad_norm': 318.3045654296875, 'learning_rate': 2.7010889292196008e-05, 'epoch': 2.02} +{'loss': 47.0476, 'grad_norm': 298.7594909667969, 'learning_rate': 2.7005444646098007e-05, 'epoch': 2.02} + 10%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 570/5520 [29:14<3:53:42, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6773737668991089, 'eval_runtime': 3.132, 'eval_samples_per_second': 57.153, 'eval_steps_per_second': 57.153, 'epoch': 2.02} +{'loss': 46.6903, 'grad_norm': 238.1414031982422, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.03} +{'loss': 47.351, 'grad_norm': 225.9528045654297, 'learning_rate': 2.6994555353901998e-05, 'epoch': 2.03} +{'loss': 46.7924, 'grad_norm': 264.1337890625, 'learning_rate': 2.6989110707803993e-05, 'epoch': 2.03} +{'loss': 45.4036, 'grad_norm': 257.63311767578125, 'learning_rate': 2.698366606170599e-05, 'epoch': 2.04} +{'loss': 45.1304, 'grad_norm': 283.10980224609375, 'learning_rate': 2.6978221415607984e-05, 'epoch': 2.04} +{'loss': 45.1448, 'grad_norm': 280.585205078125, 'learning_rate': 2.6972776769509983e-05, 'epoch': 2.04} +{'loss': 43.4235, 'grad_norm': 282.609375, 'learning_rate': 2.6967332123411978e-05, 'epoch': 2.05} +{'loss': 42.5758, 'grad_norm': 259.24346923828125, 'learning_rate': 2.6961887477313977e-05, 'epoch': 2.05} +{'loss': 42.2048, 'grad_norm': 246.6533966064453, 'learning_rate': 2.6956442831215972e-05, 'epoch': 2.05} +{'loss': 43.8324, 'grad_norm': 250.3376007080078, 'learning_rate': 2.6950998185117968e-05, 'epoch': 2.06} + 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 580/5520 [29:45<3:52:24, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.665416419506073, 'eval_runtime': 3.1344, 'eval_samples_per_second': 57.107, 'eval_steps_per_second': 57.107, 'epoch': 2.06} +{'loss': 42.8104, 'grad_norm': 255.93833923339844, 'learning_rate': 2.6945553539019966e-05, 'epoch': 2.06} +{'loss': 43.7011, 'grad_norm': 254.2652587890625, 'learning_rate': 2.6940108892921962e-05, 'epoch': 2.07} +{'loss': 44.6409, 'grad_norm': 249.3634033203125, 'learning_rate': 2.6934664246823957e-05, 'epoch': 2.07} +{'loss': 43.5825, 'grad_norm': 227.1998291015625, 'learning_rate': 2.6929219600725953e-05, 'epoch': 2.07} +{'loss': 44.9313, 'grad_norm': 268.9518127441406, 'learning_rate': 2.6923774954627948e-05, 'epoch': 2.08} +{'loss': 42.9967, 'grad_norm': 246.07669067382812, 'learning_rate': 2.6918330308529943e-05, 'epoch': 2.08} +{'loss': 45.3452, 'grad_norm': 244.62857055664062, 'learning_rate': 2.6912885662431945e-05, 'epoch': 2.08} +{'loss': 44.339, 'grad_norm': 211.9022216796875, 'learning_rate': 2.690744101633394e-05, 'epoch': 2.09} +{'loss': 44.9766, 'grad_norm': 247.23330688476562, 'learning_rate': 2.6901996370235936e-05, 'epoch': 2.09} +{'loss': 43.7674, 'grad_norm': 205.37115478515625, 'learning_rate': 2.689655172413793e-05, 'epoch': 2.09} + 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 590/5520 [30:16<3:52:45, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6593071222305298, 'eval_runtime': 3.1344, 'eval_samples_per_second': 57.108, 'eval_steps_per_second': 57.108, 'epoch': 2.09} +{'loss': 45.9008, 'grad_norm': 276.552734375, 'learning_rate': 2.6891107078039927e-05, 'epoch': 2.1} +{'loss': 42.9123, 'grad_norm': 222.2236785888672, 'learning_rate': 2.6885662431941926e-05, 'epoch': 2.1} +{'loss': 41.8036, 'grad_norm': 211.22268676757812, 'learning_rate': 2.688021778584392e-05, 'epoch': 2.1} +{'loss': 39.8408, 'grad_norm': 247.3801727294922, 'learning_rate': 2.6874773139745917e-05, 'epoch': 2.11} +{'loss': 38.5368, 'grad_norm': 325.9136962890625, 'learning_rate': 2.6869328493647912e-05, 'epoch': 2.11} +{'loss': 38.2694, 'grad_norm': 332.1748046875, 'learning_rate': 2.686388384754991e-05, 'epoch': 2.12} +{'loss': 38.0195, 'grad_norm': 249.74398803710938, 'learning_rate': 2.6858439201451906e-05, 'epoch': 2.12} +{'loss': 37.6475, 'grad_norm': 278.7181396484375, 'learning_rate': 2.6852994555353905e-05, 'epoch': 2.12} +{'loss': 37.5423, 'grad_norm': 254.46157836914062, 'learning_rate': 2.68475499092559e-05, 'epoch': 2.13} +{'loss': 39.5874, 'grad_norm': 345.65704345703125, 'learning_rate': 2.6842105263157896e-05, 'epoch': 2.13} + 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 600/5520 [30:47<3:52:56, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6737480163574219, 'eval_runtime': 3.136, 'eval_samples_per_second': 57.08, 'eval_steps_per_second': 57.08, 'epoch': 2.13} +{'loss': 38.7371, 'grad_norm': 282.4167785644531, 'learning_rate': 2.683666061705989e-05, 'epoch': 2.13} +{'loss': 39.2955, 'grad_norm': 243.5838623046875, 'learning_rate': 2.6831215970961886e-05, 'epoch': 2.14} +{'loss': 39.2204, 'grad_norm': 229.0329132080078, 'learning_rate': 2.6825771324863885e-05, 'epoch': 2.14} +{'loss': 39.3951, 'grad_norm': 247.46646118164062, 'learning_rate': 2.682032667876588e-05, 'epoch': 2.14} +{'loss': 38.9043, 'grad_norm': 219.598388671875, 'learning_rate': 2.6814882032667876e-05, 'epoch': 2.15} +{'loss': 39.1682, 'grad_norm': 251.1849822998047, 'learning_rate': 2.6809437386569875e-05, 'epoch': 2.15} +{'loss': 39.6977, 'grad_norm': 316.6958312988281, 'learning_rate': 2.680399274047187e-05, 'epoch': 2.16} +{'loss': 40.4904, 'grad_norm': 305.8714904785156, 'learning_rate': 2.6798548094373865e-05, 'epoch': 2.16} +{'loss': 41.0051, 'grad_norm': 283.31634521484375, 'learning_rate': 2.6793103448275864e-05, 'epoch': 2.16} +{'loss': 38.0505, 'grad_norm': 299.8731384277344, 'learning_rate': 2.678765880217786e-05, 'epoch': 2.17} + 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 600/5520 [30:50<3:52:56, 2.84s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 610/5520 [31:19<3:54:04, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6916811466217041, 'eval_runtime': 3.1348, 'eval_samples_per_second': 57.1, 'eval_steps_per_second': 57.1, 'epoch': 2.17} +{'loss': 26.855, 'grad_norm': 255.5745086669922, 'learning_rate': 2.6782214156079855e-05, 'epoch': 2.17} +{'loss': 25.9643, 'grad_norm': 228.0767822265625, 'learning_rate': 2.677676950998185e-05, 'epoch': 2.17} +{'loss': 26.3989, 'grad_norm': 254.83799743652344, 'learning_rate': 2.6771324863883846e-05, 'epoch': 2.18} +{'loss': 26.5123, 'grad_norm': 228.3594512939453, 'learning_rate': 2.6765880217785845e-05, 'epoch': 2.18} +{'loss': 50.0409, 'grad_norm': 480.9405822753906, 'learning_rate': 2.6760435571687843e-05, 'epoch': 2.18} +{'loss': 52.4059, 'grad_norm': 491.6844177246094, 'learning_rate': 2.675499092558984e-05, 'epoch': 2.19} +{'loss': 50.7535, 'grad_norm': 423.5033264160156, 'learning_rate': 2.6749546279491834e-05, 'epoch': 2.19} +{'loss': 47.8934, 'grad_norm': 407.8076171875, 'learning_rate': 2.674410163339383e-05, 'epoch': 2.2} +{'loss': 48.2125, 'grad_norm': 339.0987854003906, 'learning_rate': 2.6738656987295825e-05, 'epoch': 2.2} +{'loss': 47.6501, 'grad_norm': 336.1163635253906, 'learning_rate': 2.6733212341197824e-05, 'epoch': 2.2} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 620/5520 [31:50<3:51:37, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.67746502161026, 'eval_runtime': 3.1321, 'eval_samples_per_second': 57.15, 'eval_steps_per_second': 57.15, 'epoch': 2.2} +{'loss': 46.557, 'grad_norm': 289.6402587890625, 'learning_rate': 2.672776769509982e-05, 'epoch': 2.21} +{'loss': 48.0728, 'grad_norm': 270.03790283203125, 'learning_rate': 2.6722323049001814e-05, 'epoch': 2.21} +{'loss': 45.9273, 'grad_norm': 241.3233184814453, 'learning_rate': 2.671687840290381e-05, 'epoch': 2.21} +{'loss': 45.7327, 'grad_norm': 270.06201171875, 'learning_rate': 2.671143375680581e-05, 'epoch': 2.22} +{'loss': 44.1507, 'grad_norm': 239.87757873535156, 'learning_rate': 2.6705989110707807e-05, 'epoch': 2.22} +{'loss': 42.8332, 'grad_norm': 240.35128784179688, 'learning_rate': 2.6700544464609803e-05, 'epoch': 2.22} +{'loss': 42.2531, 'grad_norm': 256.8591613769531, 'learning_rate': 2.6695099818511798e-05, 'epoch': 2.23} +{'loss': 41.9307, 'grad_norm': 255.26673889160156, 'learning_rate': 2.6689655172413793e-05, 'epoch': 2.23} +{'loss': 43.077, 'grad_norm': 235.0786895751953, 'learning_rate': 2.668421052631579e-05, 'epoch': 2.23} +{'loss': 43.3731, 'grad_norm': 242.18040466308594, 'learning_rate': 2.6678765880217784e-05, 'epoch': 2.24} + 11%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 630/5520 [32:21<3:50:50, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6694422364234924, 'eval_runtime': 3.1351, 'eval_samples_per_second': 57.095, 'eval_steps_per_second': 57.095, 'epoch': 2.24} +{'loss': 42.2524, 'grad_norm': 221.5685272216797, 'learning_rate': 2.6673321234119783e-05, 'epoch': 2.24} +{'loss': 42.9608, 'grad_norm': 212.63059997558594, 'learning_rate': 2.666787658802178e-05, 'epoch': 2.25} +{'loss': 43.4169, 'grad_norm': 204.1076202392578, 'learning_rate': 2.6662431941923777e-05, 'epoch': 2.25} +{'loss': 43.4894, 'grad_norm': 237.20144653320312, 'learning_rate': 2.6656987295825773e-05, 'epoch': 2.25} +{'loss': 42.8705, 'grad_norm': 223.0536346435547, 'learning_rate': 2.6651542649727768e-05, 'epoch': 2.26} +{'loss': 44.3716, 'grad_norm': 262.2052001953125, 'learning_rate': 2.6646098003629767e-05, 'epoch': 2.26} +{'loss': 44.4382, 'grad_norm': 236.05906677246094, 'learning_rate': 2.6640653357531762e-05, 'epoch': 2.26} +{'loss': 44.2845, 'grad_norm': 238.1580810546875, 'learning_rate': 2.6635208711433757e-05, 'epoch': 2.27} +{'loss': 45.3699, 'grad_norm': 256.60260009765625, 'learning_rate': 2.6629764065335753e-05, 'epoch': 2.27} +{'loss': 43.3712, 'grad_norm': 259.56512451171875, 'learning_rate': 2.6624319419237748e-05, 'epoch': 2.27} + 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 640/5520 [32:52<3:50:42, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6590501070022583, 'eval_runtime': 3.1299, 'eval_samples_per_second': 57.191, 'eval_steps_per_second': 57.191, 'epoch': 2.27} +{'loss': 44.44, 'grad_norm': 223.30166625976562, 'learning_rate': 2.6618874773139744e-05, 'epoch': 2.28} +{'loss': 45.6937, 'grad_norm': 232.5362091064453, 'learning_rate': 2.6613430127041746e-05, 'epoch': 2.28} +{'loss': 42.2653, 'grad_norm': 212.84373474121094, 'learning_rate': 2.660798548094374e-05, 'epoch': 2.29} +{'loss': 40.0918, 'grad_norm': 224.66473388671875, 'learning_rate': 2.6602540834845737e-05, 'epoch': 2.29} +{'loss': 37.3983, 'grad_norm': 309.7171325683594, 'learning_rate': 2.6597096188747732e-05, 'epoch': 2.29} +{'loss': 35.773, 'grad_norm': 313.0796203613281, 'learning_rate': 2.6591651542649727e-05, 'epoch': 2.3} +{'loss': 38.4892, 'grad_norm': 357.21990966796875, 'learning_rate': 2.6586206896551726e-05, 'epoch': 2.3} +{'loss': 37.8198, 'grad_norm': 319.89306640625, 'learning_rate': 2.658076225045372e-05, 'epoch': 2.3} +{'loss': 37.3511, 'grad_norm': 236.42787170410156, 'learning_rate': 2.6575317604355717e-05, 'epoch': 2.31} +{'loss': 38.6153, 'grad_norm': 293.1517639160156, 'learning_rate': 2.6569872958257712e-05, 'epoch': 2.31} + 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 650/5520 [33:24<3:50:41, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6686823964118958, 'eval_runtime': 3.1317, 'eval_samples_per_second': 57.158, 'eval_steps_per_second': 57.158, 'epoch': 2.31} +{'loss': 38.5685, 'grad_norm': 259.3082580566406, 'learning_rate': 2.6564428312159708e-05, 'epoch': 2.31} +{'loss': 39.8481, 'grad_norm': 223.2305145263672, 'learning_rate': 2.6558983666061706e-05, 'epoch': 2.32} +{'loss': 39.2202, 'grad_norm': 220.5127410888672, 'learning_rate': 2.6553539019963705e-05, 'epoch': 2.32} +{'loss': 39.407, 'grad_norm': 239.54837036132812, 'learning_rate': 2.65480943738657e-05, 'epoch': 2.33} +{'loss': 38.6256, 'grad_norm': 297.1054382324219, 'learning_rate': 2.6542649727767696e-05, 'epoch': 2.33} +{'loss': 39.5373, 'grad_norm': 274.5492858886719, 'learning_rate': 2.653720508166969e-05, 'epoch': 2.33} +{'loss': 40.8163, 'grad_norm': 285.3461608886719, 'learning_rate': 2.6531760435571687e-05, 'epoch': 2.34} +{'loss': 39.5177, 'grad_norm': 280.4156799316406, 'learning_rate': 2.6526315789473685e-05, 'epoch': 2.34} +{'loss': 39.3931, 'grad_norm': 304.635986328125, 'learning_rate': 2.652087114337568e-05, 'epoch': 2.34} +{'loss': 36.4478, 'grad_norm': 261.5251159667969, 'learning_rate': 2.6515426497277676e-05, 'epoch': 2.35} + 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 660/5520 [33:55<3:50:11, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6913852691650391, 'eval_runtime': 3.1307, 'eval_samples_per_second': 57.176, 'eval_steps_per_second': 57.176, 'epoch': 2.35} +{'loss': 28.0558, 'grad_norm': 315.155029296875, 'learning_rate': 2.6509981851179675e-05, 'epoch': 2.35} +{'loss': 25.3844, 'grad_norm': 244.11302185058594, 'learning_rate': 2.650453720508167e-05, 'epoch': 2.35} +{'loss': 25.7319, 'grad_norm': 214.6631317138672, 'learning_rate': 2.649909255898367e-05, 'epoch': 2.36} +{'loss': 26.9239, 'grad_norm': 239.142822265625, 'learning_rate': 2.6493647912885664e-05, 'epoch': 2.36} +{'loss': 51.6751, 'grad_norm': 466.52301025390625, 'learning_rate': 2.648820326678766e-05, 'epoch': 2.36} +{'loss': 50.3911, 'grad_norm': 417.0456848144531, 'learning_rate': 2.6482758620689655e-05, 'epoch': 2.37} +{'loss': 50.2844, 'grad_norm': 428.5924987792969, 'learning_rate': 2.647731397459165e-05, 'epoch': 2.37} +{'loss': 49.4586, 'grad_norm': 377.35205078125, 'learning_rate': 2.6471869328493646e-05, 'epoch': 2.38} +{'loss': 47.2812, 'grad_norm': 319.4757080078125, 'learning_rate': 2.6466424682395645e-05, 'epoch': 2.38} +{'loss': 46.9634, 'grad_norm': 294.8909912109375, 'learning_rate': 2.6460980036297644e-05, 'epoch': 2.38} + 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 670/5520 [34:26<3:49:20, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.669245183467865, 'eval_runtime': 3.1358, 'eval_samples_per_second': 57.082, 'eval_steps_per_second': 57.082, 'epoch': 2.38} +{'loss': 46.9558, 'grad_norm': 261.2301940917969, 'learning_rate': 2.645553539019964e-05, 'epoch': 2.39} +{'loss': 47.8019, 'grad_norm': 258.07611083984375, 'learning_rate': 2.6450090744101634e-05, 'epoch': 2.39} +{'loss': 45.0965, 'grad_norm': 224.54913330078125, 'learning_rate': 2.644464609800363e-05, 'epoch': 2.39} +{'loss': 46.5213, 'grad_norm': 294.1282958984375, 'learning_rate': 2.643920145190563e-05, 'epoch': 2.4} +{'loss': 44.6797, 'grad_norm': 286.87744140625, 'learning_rate': 2.6433756805807624e-05, 'epoch': 2.4} +{'loss': 44.6766, 'grad_norm': 297.0935974121094, 'learning_rate': 2.642831215970962e-05, 'epoch': 2.4} +{'loss': 42.2207, 'grad_norm': 245.94793701171875, 'learning_rate': 2.6422867513611615e-05, 'epoch': 2.41} +{'loss': 41.9373, 'grad_norm': 227.2701416015625, 'learning_rate': 2.641742286751361e-05, 'epoch': 2.41} +{'loss': 43.5779, 'grad_norm': 236.96005249023438, 'learning_rate': 2.641197822141561e-05, 'epoch': 2.42} +{'loss': 41.6609, 'grad_norm': 244.6314239501953, 'learning_rate': 2.6406533575317608e-05, 'epoch': 2.42} + 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 680/5520 [34:57<3:48:41, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6653958559036255, 'eval_runtime': 3.1314, 'eval_samples_per_second': 57.163, 'eval_steps_per_second': 57.163, 'epoch': 2.42} +{'loss': 43.4188, 'grad_norm': 252.40667724609375, 'learning_rate': 2.6401088929219603e-05, 'epoch': 2.42} +{'loss': 42.4463, 'grad_norm': 218.78762817382812, 'learning_rate': 2.63956442831216e-05, 'epoch': 2.43} +{'loss': 43.9986, 'grad_norm': 216.69850158691406, 'learning_rate': 2.6390199637023594e-05, 'epoch': 2.43} +{'loss': 44.358, 'grad_norm': 222.8838348388672, 'learning_rate': 2.638475499092559e-05, 'epoch': 2.43} +{'loss': 42.2287, 'grad_norm': 227.73489379882812, 'learning_rate': 2.6379310344827588e-05, 'epoch': 2.44} +{'loss': 44.0429, 'grad_norm': 227.0625762939453, 'learning_rate': 2.6373865698729583e-05, 'epoch': 2.44} +{'loss': 43.5408, 'grad_norm': 212.73170471191406, 'learning_rate': 2.636842105263158e-05, 'epoch': 2.44} +{'loss': 44.9755, 'grad_norm': 213.81211853027344, 'learning_rate': 2.6362976406533574e-05, 'epoch': 2.45} +{'loss': 44.0524, 'grad_norm': 232.90858459472656, 'learning_rate': 2.6357531760435573e-05, 'epoch': 2.45} +{'loss': 45.1275, 'grad_norm': 260.18408203125, 'learning_rate': 2.6352087114337568e-05, 'epoch': 2.46} + 12%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 690/5520 [35:28<3:49:06, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6535969972610474, 'eval_runtime': 3.1327, 'eval_samples_per_second': 57.139, 'eval_steps_per_second': 57.139, 'epoch': 2.46} +{'loss': 43.8625, 'grad_norm': 283.89227294921875, 'learning_rate': 2.6346642468239567e-05, 'epoch': 2.46} +{'loss': 44.1129, 'grad_norm': 214.93670654296875, 'learning_rate': 2.6341197822141562e-05, 'epoch': 2.46} +{'loss': 43.7334, 'grad_norm': 207.7038116455078, 'learning_rate': 2.6335753176043558e-05, 'epoch': 2.47} +{'loss': 38.3952, 'grad_norm': 320.4886779785156, 'learning_rate': 2.6330308529945553e-05, 'epoch': 2.47} +{'loss': 38.443, 'grad_norm': 356.9686279296875, 'learning_rate': 2.632486388384755e-05, 'epoch': 2.47} +{'loss': 36.617, 'grad_norm': 251.1065216064453, 'learning_rate': 2.6319419237749547e-05, 'epoch': 2.48} +{'loss': 38.6545, 'grad_norm': 301.3539123535156, 'learning_rate': 2.6313974591651543e-05, 'epoch': 2.48} +{'loss': 37.6828, 'grad_norm': 206.49517822265625, 'learning_rate': 2.630852994555354e-05, 'epoch': 2.48} +{'loss': 39.2154, 'grad_norm': 230.03382873535156, 'learning_rate': 2.6303085299455537e-05, 'epoch': 2.49} +{'loss': 37.5136, 'grad_norm': 224.42454528808594, 'learning_rate': 2.6297640653357532e-05, 'epoch': 2.49} + 13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 700/5520 [35:59<3:48:03, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.657163679599762, 'eval_runtime': 3.1455, 'eval_samples_per_second': 56.906, 'eval_steps_per_second': 56.906, 'epoch': 2.49} +{'loss': 37.1198, 'grad_norm': 186.3481903076172, 'learning_rate': 2.6292196007259528e-05, 'epoch': 2.49} +{'loss': 38.3062, 'grad_norm': 231.81553649902344, 'learning_rate': 2.6286751361161526e-05, 'epoch': 2.5} +{'loss': 39.543, 'grad_norm': 221.0079803466797, 'learning_rate': 2.6281306715063522e-05, 'epoch': 2.5} +{'loss': 38.5384, 'grad_norm': 251.6171112060547, 'learning_rate': 2.6275862068965517e-05, 'epoch': 2.51} +{'loss': 39.2217, 'grad_norm': 239.07843017578125, 'learning_rate': 2.6270417422867512e-05, 'epoch': 2.51} +{'loss': 39.2529, 'grad_norm': 256.3560485839844, 'learning_rate': 2.6264972776769508e-05, 'epoch': 2.51} +{'loss': 39.6369, 'grad_norm': 245.74522399902344, 'learning_rate': 2.625952813067151e-05, 'epoch': 2.52} +{'loss': 40.1488, 'grad_norm': 279.8902893066406, 'learning_rate': 2.6254083484573505e-05, 'epoch': 2.52} +{'loss': 40.6809, 'grad_norm': 267.12811279296875, 'learning_rate': 2.62486388384755e-05, 'epoch': 2.52} +{'loss': 35.9417, 'grad_norm': 291.1154479980469, 'learning_rate': 2.6243194192377496e-05, 'epoch': 2.53} + 13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 710/5520 [36:31<3:48:06, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.677870512008667, 'eval_runtime': 3.1315, 'eval_samples_per_second': 57.16, 'eval_steps_per_second': 57.16, 'epoch': 2.53} +{'loss': 26.5904, 'grad_norm': 362.4072570800781, 'learning_rate': 2.623774954627949e-05, 'epoch': 2.53} +{'loss': 25.4147, 'grad_norm': 346.172607421875, 'learning_rate': 2.6232304900181487e-05, 'epoch': 2.53} +{'loss': 26.0221, 'grad_norm': 193.29322814941406, 'learning_rate': 2.6226860254083486e-05, 'epoch': 2.54} +{'loss': 26.1951, 'grad_norm': 195.480224609375, 'learning_rate': 2.622141560798548e-05, 'epoch': 2.54} +{'loss': 49.9201, 'grad_norm': 459.3381652832031, 'learning_rate': 2.6215970961887476e-05, 'epoch': 2.55} +{'loss': 51.813, 'grad_norm': 430.83160400390625, 'learning_rate': 2.6210526315789475e-05, 'epoch': 2.55} +{'loss': 49.6055, 'grad_norm': 395.5831604003906, 'learning_rate': 2.620508166969147e-05, 'epoch': 2.55} +{'loss': 47.4299, 'grad_norm': 349.0957946777344, 'learning_rate': 2.619963702359347e-05, 'epoch': 2.56} +{'loss': 47.5578, 'grad_norm': 317.4203796386719, 'learning_rate': 2.6194192377495465e-05, 'epoch': 2.56} +{'loss': 47.19, 'grad_norm': 284.44659423828125, 'learning_rate': 2.618874773139746e-05, 'epoch': 2.56} + 13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 720/5520 [37:02<3:48:13, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6700878739356995, 'eval_runtime': 3.1309, 'eval_samples_per_second': 57.172, 'eval_steps_per_second': 57.172, 'epoch': 2.56} +{'loss': 47.0923, 'grad_norm': 313.7208251953125, 'learning_rate': 2.6183303085299456e-05, 'epoch': 2.57} +{'loss': 47.024, 'grad_norm': 284.9776611328125, 'learning_rate': 2.617785843920145e-05, 'epoch': 2.57} +{'loss': 47.0988, 'grad_norm': 264.72515869140625, 'learning_rate': 2.6172413793103446e-05, 'epoch': 2.57} +{'loss': 46.1584, 'grad_norm': 244.52915954589844, 'learning_rate': 2.6166969147005445e-05, 'epoch': 2.58} +{'loss': 44.7084, 'grad_norm': 255.7130889892578, 'learning_rate': 2.6161524500907444e-05, 'epoch': 2.58} +{'loss': 44.2491, 'grad_norm': 276.594482421875, 'learning_rate': 2.615607985480944e-05, 'epoch': 2.59} +{'loss': 42.3555, 'grad_norm': 274.7431335449219, 'learning_rate': 2.6150635208711435e-05, 'epoch': 2.59} +{'loss': 43.7643, 'grad_norm': 276.954833984375, 'learning_rate': 2.614519056261343e-05, 'epoch': 2.59} +{'loss': 42.2725, 'grad_norm': 194.3367156982422, 'learning_rate': 2.613974591651543e-05, 'epoch': 2.6} +{'loss': 42.5421, 'grad_norm': 176.41236877441406, 'learning_rate': 2.6134301270417424e-05, 'epoch': 2.6} + 13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 730/5520 [37:33<3:46:16, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6591465473175049, 'eval_runtime': 3.1343, 'eval_samples_per_second': 57.111, 'eval_steps_per_second': 57.111, 'epoch': 2.6} +{'loss': 41.3438, 'grad_norm': 205.8289031982422, 'learning_rate': 2.612885662431942e-05, 'epoch': 2.6} +{'loss': 44.5023, 'grad_norm': 204.97471618652344, 'learning_rate': 2.6123411978221415e-05, 'epoch': 2.61} +{'loss': 42.8994, 'grad_norm': 230.9344482421875, 'learning_rate': 2.611796733212341e-05, 'epoch': 2.61} +{'loss': 43.5145, 'grad_norm': 186.5467987060547, 'learning_rate': 2.6112522686025406e-05, 'epoch': 2.61} +{'loss': 44.4214, 'grad_norm': 212.39852905273438, 'learning_rate': 2.6107078039927408e-05, 'epoch': 2.62} +{'loss': 44.4302, 'grad_norm': 214.2425994873047, 'learning_rate': 2.6101633393829403e-05, 'epoch': 2.62} +{'loss': 44.4621, 'grad_norm': 203.32107543945312, 'learning_rate': 2.60961887477314e-05, 'epoch': 2.62} +{'loss': 44.0782, 'grad_norm': 227.49472045898438, 'learning_rate': 2.6090744101633394e-05, 'epoch': 2.63} +{'loss': 44.7653, 'grad_norm': 242.5762939453125, 'learning_rate': 2.608529945553539e-05, 'epoch': 2.63} +{'loss': 43.9382, 'grad_norm': 223.6633758544922, 'learning_rate': 2.6079854809437388e-05, 'epoch': 2.64} + 13%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 740/5520 [38:04<3:46:14, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6548755764961243, 'eval_runtime': 3.1317, 'eval_samples_per_second': 57.158, 'eval_steps_per_second': 57.158, 'epoch': 2.64} +{'loss': 45.1399, 'grad_norm': 237.716552734375, 'learning_rate': 2.6074410163339384e-05, 'epoch': 2.64} +{'loss': 40.4102, 'grad_norm': 214.22898864746094, 'learning_rate': 2.606896551724138e-05, 'epoch': 2.64} +{'loss': 39.1898, 'grad_norm': 312.23956298828125, 'learning_rate': 2.6063520871143374e-05, 'epoch': 2.65} +{'loss': 36.5315, 'grad_norm': 199.07408142089844, 'learning_rate': 2.6058076225045373e-05, 'epoch': 2.65} +{'loss': 36.8341, 'grad_norm': 229.65692138671875, 'learning_rate': 2.605263157894737e-05, 'epoch': 2.65} +{'loss': 36.1602, 'grad_norm': 222.81546020507812, 'learning_rate': 2.6047186932849367e-05, 'epoch': 2.66} +{'loss': 36.7221, 'grad_norm': 253.58770751953125, 'learning_rate': 2.6041742286751363e-05, 'epoch': 2.66} +{'loss': 37.0262, 'grad_norm': 291.77325439453125, 'learning_rate': 2.6036297640653358e-05, 'epoch': 2.66} +{'loss': 38.2955, 'grad_norm': 293.3721618652344, 'learning_rate': 2.6030852994555353e-05, 'epoch': 2.67} +{'loss': 38.9277, 'grad_norm': 210.18955993652344, 'learning_rate': 2.602540834845735e-05, 'epoch': 2.67} + 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 750/5520 [38:35<3:46:11, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6631377339363098, 'eval_runtime': 3.1357, 'eval_samples_per_second': 57.084, 'eval_steps_per_second': 57.084, 'epoch': 2.67} +{'loss': 39.1264, 'grad_norm': 224.5206298828125, 'learning_rate': 2.6019963702359348e-05, 'epoch': 2.68} +{'loss': 40.2912, 'grad_norm': 307.2724914550781, 'learning_rate': 2.6014519056261343e-05, 'epoch': 2.68} +{'loss': 39.1569, 'grad_norm': 287.6835021972656, 'learning_rate': 2.600907441016334e-05, 'epoch': 2.68} +{'loss': 38.4985, 'grad_norm': 286.31817626953125, 'learning_rate': 2.6003629764065337e-05, 'epoch': 2.69} +{'loss': 40.7763, 'grad_norm': 269.58740234375, 'learning_rate': 2.5998185117967332e-05, 'epoch': 2.69} +{'loss': 39.9336, 'grad_norm': 222.31248474121094, 'learning_rate': 2.599274047186933e-05, 'epoch': 2.69} +{'loss': 39.4074, 'grad_norm': 214.96624755859375, 'learning_rate': 2.5987295825771327e-05, 'epoch': 2.7} +{'loss': 40.9984, 'grad_norm': 296.5968322753906, 'learning_rate': 2.5981851179673322e-05, 'epoch': 2.7} +{'loss': 38.6395, 'grad_norm': 228.1329803466797, 'learning_rate': 2.5976406533575317e-05, 'epoch': 2.7} +{'loss': 28.1237, 'grad_norm': 254.83538818359375, 'learning_rate': 2.5970961887477313e-05, 'epoch': 2.71} + 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 760/5520 [39:07<3:45:59, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6833599209785461, 'eval_runtime': 3.1401, 'eval_samples_per_second': 57.005, 'eval_steps_per_second': 57.005, 'epoch': 2.71} +{'loss': 25.9143, 'grad_norm': 196.43338012695312, 'learning_rate': 2.5965517241379308e-05, 'epoch': 2.71} +{'loss': 25.2986, 'grad_norm': 223.3903350830078, 'learning_rate': 2.596007259528131e-05, 'epoch': 2.72} +{'loss': 25.9919, 'grad_norm': 220.7471923828125, 'learning_rate': 2.5954627949183306e-05, 'epoch': 2.72} +{'loss': 26.2117, 'grad_norm': 204.15382385253906, 'learning_rate': 2.59491833030853e-05, 'epoch': 2.72} +{'loss': 52.0849, 'grad_norm': 536.2657470703125, 'learning_rate': 2.5943738656987296e-05, 'epoch': 2.73} +{'loss': 51.8393, 'grad_norm': 623.6157836914062, 'learning_rate': 2.5938294010889292e-05, 'epoch': 2.73} +{'loss': 51.4791, 'grad_norm': 491.22821044921875, 'learning_rate': 2.593284936479129e-05, 'epoch': 2.73} +{'loss': 49.5306, 'grad_norm': 414.8413391113281, 'learning_rate': 2.5927404718693286e-05, 'epoch': 2.74} +{'loss': 47.1806, 'grad_norm': 363.01715087890625, 'learning_rate': 2.592196007259528e-05, 'epoch': 2.74} +{'loss': 47.7516, 'grad_norm': 309.8416442871094, 'learning_rate': 2.5916515426497277e-05, 'epoch': 2.74} + 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 770/5520 [39:38<3:45:02, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6723723411560059, 'eval_runtime': 3.1345, 'eval_samples_per_second': 57.106, 'eval_steps_per_second': 57.106, 'epoch': 2.74} +{'loss': 46.8819, 'grad_norm': 297.294189453125, 'learning_rate': 2.5911070780399276e-05, 'epoch': 2.75} +{'loss': 46.6719, 'grad_norm': 271.9000549316406, 'learning_rate': 2.590562613430127e-05, 'epoch': 2.75} +{'loss': 45.829, 'grad_norm': 223.2354278564453, 'learning_rate': 2.590018148820327e-05, 'epoch': 2.75} +{'loss': 46.8854, 'grad_norm': 267.2200012207031, 'learning_rate': 2.5894736842105265e-05, 'epoch': 2.76} +{'loss': 44.7511, 'grad_norm': 240.17990112304688, 'learning_rate': 2.588929219600726e-05, 'epoch': 2.76} +{'loss': 42.0385, 'grad_norm': 319.76959228515625, 'learning_rate': 2.5883847549909256e-05, 'epoch': 2.77} +{'loss': 43.6279, 'grad_norm': 221.0363006591797, 'learning_rate': 2.587840290381125e-05, 'epoch': 2.77} +{'loss': 42.0023, 'grad_norm': 211.2090606689453, 'learning_rate': 2.587295825771325e-05, 'epoch': 2.77} +{'loss': 41.4171, 'grad_norm': 214.4199981689453, 'learning_rate': 2.5867513611615245e-05, 'epoch': 2.78} +{'loss': 42.2437, 'grad_norm': 248.0699462890625, 'learning_rate': 2.586206896551724e-05, 'epoch': 2.78} + 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 780/5520 [40:09<3:44:28, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6616525053977966, 'eval_runtime': 3.1342, 'eval_samples_per_second': 57.111, 'eval_steps_per_second': 57.111, 'epoch': 2.78} +{'loss': 42.0058, 'grad_norm': 204.29493713378906, 'learning_rate': 2.585662431941924e-05, 'epoch': 2.78} +{'loss': 43.7219, 'grad_norm': 223.14077758789062, 'learning_rate': 2.5851179673321235e-05, 'epoch': 2.79} +{'loss': 44.1564, 'grad_norm': 219.99261474609375, 'learning_rate': 2.584573502722323e-05, 'epoch': 2.79} +{'loss': 43.631, 'grad_norm': 194.47219848632812, 'learning_rate': 2.584029038112523e-05, 'epoch': 2.79} +{'loss': 43.4141, 'grad_norm': 191.4344940185547, 'learning_rate': 2.5834845735027224e-05, 'epoch': 2.8} +{'loss': 43.936, 'grad_norm': 218.28073120117188, 'learning_rate': 2.582940108892922e-05, 'epoch': 2.8} +{'loss': 44.7909, 'grad_norm': 186.77444458007812, 'learning_rate': 2.5823956442831215e-05, 'epoch': 2.81} +{'loss': 43.726, 'grad_norm': 205.01918029785156, 'learning_rate': 2.581851179673321e-05, 'epoch': 2.81} +{'loss': 43.7542, 'grad_norm': 200.90245056152344, 'learning_rate': 2.581306715063521e-05, 'epoch': 2.81} +{'loss': 44.6297, 'grad_norm': 200.3115692138672, 'learning_rate': 2.5807622504537208e-05, 'epoch': 2.82} + 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 790/5520 [40:40<3:43:14, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.65194171667099, 'eval_runtime': 3.1309, 'eval_samples_per_second': 57.173, 'eval_steps_per_second': 57.173, 'epoch': 2.82} +{'loss': 44.8844, 'grad_norm': 278.512939453125, 'learning_rate': 2.5802177858439204e-05, 'epoch': 2.82} +{'loss': 44.1802, 'grad_norm': 231.06387329101562, 'learning_rate': 2.57967332123412e-05, 'epoch': 2.82} +{'loss': 41.4748, 'grad_norm': 232.7779083251953, 'learning_rate': 2.5791288566243194e-05, 'epoch': 2.83} +{'loss': 38.8538, 'grad_norm': 229.13340759277344, 'learning_rate': 2.578584392014519e-05, 'epoch': 2.83} +{'loss': 38.5641, 'grad_norm': 290.4147644042969, 'learning_rate': 2.578039927404719e-05, 'epoch': 2.83} +{'loss': 36.2725, 'grad_norm': 285.3528137207031, 'learning_rate': 2.5774954627949184e-05, 'epoch': 2.84} +{'loss': 36.5417, 'grad_norm': 218.9436492919922, 'learning_rate': 2.576950998185118e-05, 'epoch': 2.84} +{'loss': 37.4064, 'grad_norm': 264.1986083984375, 'learning_rate': 2.5764065335753175e-05, 'epoch': 2.85} +{'loss': 38.2529, 'grad_norm': 182.3573760986328, 'learning_rate': 2.5758620689655173e-05, 'epoch': 2.85} +{'loss': 38.1339, 'grad_norm': 213.42701721191406, 'learning_rate': 2.5753176043557172e-05, 'epoch': 2.85} + 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 800/5520 [41:11<3:43:35, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6563644409179688, 'eval_runtime': 3.1295, 'eval_samples_per_second': 57.198, 'eval_steps_per_second': 57.198, 'epoch': 2.85} +{'loss': 37.8052, 'grad_norm': 277.4792175292969, 'learning_rate': 2.5747731397459168e-05, 'epoch': 2.86} +{'loss': 38.4587, 'grad_norm': 299.55462646484375, 'learning_rate': 2.5742286751361163e-05, 'epoch': 2.86} +{'loss': 39.4709, 'grad_norm': 253.10867309570312, 'learning_rate': 2.5736842105263158e-05, 'epoch': 2.86} +{'loss': 39.0288, 'grad_norm': 228.04470825195312, 'learning_rate': 2.5731397459165154e-05, 'epoch': 2.87} +{'loss': 39.8209, 'grad_norm': 211.8145751953125, 'learning_rate': 2.572595281306715e-05, 'epoch': 2.87} +{'loss': 40.0695, 'grad_norm': 201.8890838623047, 'learning_rate': 2.5720508166969148e-05, 'epoch': 2.87} +{'loss': 40.036, 'grad_norm': 243.47447204589844, 'learning_rate': 2.5715063520871143e-05, 'epoch': 2.88} +{'loss': 40.683, 'grad_norm': 289.123291015625, 'learning_rate': 2.5709618874773142e-05, 'epoch': 2.88} +{'loss': 41.132, 'grad_norm': 292.1619567871094, 'learning_rate': 2.5704174228675137e-05, 'epoch': 2.88} +{'loss': 32.0392, 'grad_norm': 279.39947509765625, 'learning_rate': 2.5698729582577133e-05, 'epoch': 2.89} + 14%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 800/5520 [41:14<3:43:35, 2.84s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 810/5520 [41:43<3:44:33, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6752151250839233, 'eval_runtime': 3.1352, 'eval_samples_per_second': 57.094, 'eval_steps_per_second': 57.094, 'epoch': 2.89} +{'loss': 25.0794, 'grad_norm': 255.42556762695312, 'learning_rate': 2.569328493647913e-05, 'epoch': 2.89} +{'loss': 26.0036, 'grad_norm': 235.7202606201172, 'learning_rate': 2.5687840290381127e-05, 'epoch': 2.9} +{'loss': 26.1592, 'grad_norm': 196.36996459960938, 'learning_rate': 2.5682395644283122e-05, 'epoch': 2.9} +{'loss': 26.9693, 'grad_norm': 254.46896362304688, 'learning_rate': 2.5676950998185118e-05, 'epoch': 2.9} +{'loss': 51.2981, 'grad_norm': 435.552734375, 'learning_rate': 2.5671506352087113e-05, 'epoch': 2.91} +{'loss': 50.1727, 'grad_norm': 523.48388671875, 'learning_rate': 2.566606170598911e-05, 'epoch': 2.91} +{'loss': 50.5994, 'grad_norm': 432.06561279296875, 'learning_rate': 2.5660617059891107e-05, 'epoch': 2.91} +{'loss': 49.3211, 'grad_norm': 354.7589416503906, 'learning_rate': 2.5655172413793106e-05, 'epoch': 2.92} +{'loss': 48.1541, 'grad_norm': 327.1822509765625, 'learning_rate': 2.56497277676951e-05, 'epoch': 2.92} +{'loss': 46.6141, 'grad_norm': 309.42279052734375, 'learning_rate': 2.5644283121597097e-05, 'epoch': 2.92} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 820/5520 [42:15<3:42:41, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6766613721847534, 'eval_runtime': 3.1328, 'eval_samples_per_second': 57.137, 'eval_steps_per_second': 57.137, 'epoch': 2.92} +{'loss': 43.6344, 'grad_norm': 286.34368896484375, 'learning_rate': 2.5638838475499092e-05, 'epoch': 2.93} +{'loss': 43.0613, 'grad_norm': 224.4126739501953, 'learning_rate': 2.563339382940109e-05, 'epoch': 2.93} +{'loss': 42.9807, 'grad_norm': 246.1632537841797, 'learning_rate': 2.5627949183303086e-05, 'epoch': 2.94} +{'loss': 43.4627, 'grad_norm': 298.6852722167969, 'learning_rate': 2.562250453720508e-05, 'epoch': 2.94} +{'loss': 42.576, 'grad_norm': 255.9106903076172, 'learning_rate': 2.5617059891107077e-05, 'epoch': 2.94} +{'loss': 43.5352, 'grad_norm': 227.76461791992188, 'learning_rate': 2.5611615245009072e-05, 'epoch': 2.95} +{'loss': 44.6115, 'grad_norm': 262.1735534667969, 'learning_rate': 2.560617059891107e-05, 'epoch': 2.95} +{'loss': 45.1437, 'grad_norm': 261.7061767578125, 'learning_rate': 2.560072595281307e-05, 'epoch': 2.95} +{'loss': 43.7623, 'grad_norm': 241.5306396484375, 'learning_rate': 2.5595281306715065e-05, 'epoch': 2.96} +{'loss': 39.4783, 'grad_norm': 262.2628479003906, 'learning_rate': 2.558983666061706e-05, 'epoch': 2.96} + 15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 830/5520 [42:46<3:43:34, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6567817330360413, 'eval_runtime': 3.1365, 'eval_samples_per_second': 57.069, 'eval_steps_per_second': 57.069, 'epoch': 2.96} +{'loss': 37.5457, 'grad_norm': 313.9430236816406, 'learning_rate': 2.5584392014519056e-05, 'epoch': 2.96} +{'loss': 36.6506, 'grad_norm': 223.93162536621094, 'learning_rate': 2.557894736842105e-05, 'epoch': 2.97} +{'loss': 39.2182, 'grad_norm': 319.6013488769531, 'learning_rate': 2.557350272232305e-05, 'epoch': 2.97} +{'loss': 39.3368, 'grad_norm': 237.14610290527344, 'learning_rate': 2.5568058076225046e-05, 'epoch': 2.98} +{'loss': 38.105, 'grad_norm': 270.99481201171875, 'learning_rate': 2.556261343012704e-05, 'epoch': 2.98} +{'loss': 40.1464, 'grad_norm': 236.88687133789062, 'learning_rate': 2.555716878402904e-05, 'epoch': 2.98} +{'loss': 41.4284, 'grad_norm': 205.72084045410156, 'learning_rate': 2.5551724137931035e-05, 'epoch': 2.99} +{'loss': 41.4085, 'grad_norm': 243.73684692382812, 'learning_rate': 2.554627949183303e-05, 'epoch': 2.99} +{'loss': 28.59, 'grad_norm': 200.96815490722656, 'learning_rate': 2.554083484573503e-05, 'epoch': 2.99} +{'loss': 25.85, 'grad_norm': 258.556884765625, 'learning_rate': 2.5535390199637025e-05, 'epoch': 3.0} + 15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 840/5520 [43:17<3:42:18, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6678276062011719, 'eval_runtime': 3.1338, 'eval_samples_per_second': 57.118, 'eval_steps_per_second': 57.118, 'epoch': 3.0} +{'loss': 23.6925, 'grad_norm': 262.8646545410156, 'learning_rate': 2.552994555353902e-05, 'epoch': 3.0} +{'loss': 47.4725, 'grad_norm': 388.7956848144531, 'learning_rate': 2.5524500907441015e-05, 'epoch': 3.0} +{'loss': 48.1197, 'grad_norm': 371.5218200683594, 'learning_rate': 2.551905626134301e-05, 'epoch': 3.01} +{'loss': 49.0124, 'grad_norm': 296.68096923828125, 'learning_rate': 2.551361161524501e-05, 'epoch': 3.01} +{'loss': 47.0989, 'grad_norm': 258.3492126464844, 'learning_rate': 2.550816696914701e-05, 'epoch': 3.01} +{'loss': 46.5102, 'grad_norm': 262.0732116699219, 'learning_rate': 2.5502722323049004e-05, 'epoch': 3.02} +{'loss': 47.2614, 'grad_norm': 249.84967041015625, 'learning_rate': 2.5497277676951e-05, 'epoch': 3.02} +{'loss': 44.8942, 'grad_norm': 259.7544250488281, 'learning_rate': 2.5491833030852995e-05, 'epoch': 3.03} +{'loss': 45.42, 'grad_norm': 264.3735656738281, 'learning_rate': 2.5486388384754993e-05, 'epoch': 3.03} +{'loss': 46.1006, 'grad_norm': 295.92919921875, 'learning_rate': 2.548094373865699e-05, 'epoch': 3.03} + 15%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 850/5520 [43:48<3:42:23, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6581276059150696, 'eval_runtime': 3.1326, 'eval_samples_per_second': 57.14, 'eval_steps_per_second': 57.14, 'epoch': 3.03} +{'loss': 46.1223, 'grad_norm': 311.7466125488281, 'learning_rate': 2.5475499092558984e-05, 'epoch': 3.04} +{'loss': 45.1578, 'grad_norm': 208.77503967285156, 'learning_rate': 2.547005444646098e-05, 'epoch': 3.04} +{'loss': 42.9368, 'grad_norm': 203.6681671142578, 'learning_rate': 2.5464609800362975e-05, 'epoch': 3.04} +{'loss': 42.4021, 'grad_norm': 251.1130828857422, 'learning_rate': 2.5459165154264974e-05, 'epoch': 3.05} +{'loss': 41.7869, 'grad_norm': 253.73077392578125, 'learning_rate': 2.5453720508166972e-05, 'epoch': 3.05} +{'loss': 41.3124, 'grad_norm': 202.12892150878906, 'learning_rate': 2.5448275862068968e-05, 'epoch': 3.05} +{'loss': 41.1522, 'grad_norm': 250.02322387695312, 'learning_rate': 2.5442831215970963e-05, 'epoch': 3.06} +{'loss': 41.4023, 'grad_norm': 171.8944549560547, 'learning_rate': 2.543738656987296e-05, 'epoch': 3.06} +{'loss': 43.0454, 'grad_norm': 245.9447784423828, 'learning_rate': 2.5431941923774954e-05, 'epoch': 3.07} +{'loss': 43.7984, 'grad_norm': 216.93519592285156, 'learning_rate': 2.5426497277676953e-05, 'epoch': 3.07} + 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 860/5520 [44:19<3:40:00, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6542946100234985, 'eval_runtime': 3.1331, 'eval_samples_per_second': 57.132, 'eval_steps_per_second': 57.132, 'epoch': 3.07} +{'loss': 42.7845, 'grad_norm': 271.1915588378906, 'learning_rate': 2.5421052631578948e-05, 'epoch': 3.07} +{'loss': 43.042, 'grad_norm': 262.10791015625, 'learning_rate': 2.5415607985480943e-05, 'epoch': 3.08} +{'loss': 42.541, 'grad_norm': 234.5153045654297, 'learning_rate': 2.541016333938294e-05, 'epoch': 3.08} +{'loss': 43.6462, 'grad_norm': 183.66058349609375, 'learning_rate': 2.5404718693284938e-05, 'epoch': 3.08} +{'loss': 43.9704, 'grad_norm': 232.13169860839844, 'learning_rate': 2.5399274047186933e-05, 'epoch': 3.09} +{'loss': 43.1515, 'grad_norm': 219.72445678710938, 'learning_rate': 2.5393829401088932e-05, 'epoch': 3.09} +{'loss': 43.9146, 'grad_norm': 215.75115966796875, 'learning_rate': 2.5388384754990927e-05, 'epoch': 3.09} +{'loss': 44.323, 'grad_norm': 248.385498046875, 'learning_rate': 2.5382940108892923e-05, 'epoch': 3.1} +{'loss': 45.381, 'grad_norm': 295.951171875, 'learning_rate': 2.5377495462794918e-05, 'epoch': 3.1} +{'loss': 42.4742, 'grad_norm': 239.43002319335938, 'learning_rate': 2.5372050816696913e-05, 'epoch': 3.1} + 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 870/5520 [44:50<3:40:22, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.647969663143158, 'eval_runtime': 3.1334, 'eval_samples_per_second': 57.126, 'eval_steps_per_second': 57.126, 'epoch': 3.1} +{'loss': 39.4421, 'grad_norm': 207.01927185058594, 'learning_rate': 2.5366606170598912e-05, 'epoch': 3.11} +{'loss': 37.9044, 'grad_norm': 255.97584533691406, 'learning_rate': 2.5361161524500907e-05, 'epoch': 3.11} +{'loss': 36.04, 'grad_norm': 379.0328674316406, 'learning_rate': 2.5355716878402906e-05, 'epoch': 3.12} +{'loss': 36.4328, 'grad_norm': 216.48049926757812, 'learning_rate': 2.53502722323049e-05, 'epoch': 3.12} +{'loss': 36.5303, 'grad_norm': 242.0985565185547, 'learning_rate': 2.5344827586206897e-05, 'epoch': 3.12} +{'loss': 38.199, 'grad_norm': 212.566650390625, 'learning_rate': 2.5339382940108892e-05, 'epoch': 3.13} +{'loss': 39.0308, 'grad_norm': 217.37811279296875, 'learning_rate': 2.533393829401089e-05, 'epoch': 3.13} +{'loss': 36.3811, 'grad_norm': 186.531494140625, 'learning_rate': 2.5328493647912887e-05, 'epoch': 3.13} +{'loss': 37.8778, 'grad_norm': 202.18603515625, 'learning_rate': 2.5323049001814882e-05, 'epoch': 3.14} +{'loss': 38.3339, 'grad_norm': 246.00283813476562, 'learning_rate': 2.5317604355716877e-05, 'epoch': 3.14} + 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 880/5520 [45:22<3:40:05, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6535190343856812, 'eval_runtime': 3.1337, 'eval_samples_per_second': 57.121, 'eval_steps_per_second': 57.121, 'epoch': 3.14} +{'loss': 38.8566, 'grad_norm': 246.4683074951172, 'learning_rate': 2.5312159709618873e-05, 'epoch': 3.14} +{'loss': 38.0433, 'grad_norm': 243.6247100830078, 'learning_rate': 2.5306715063520875e-05, 'epoch': 3.15} +{'loss': 40.121, 'grad_norm': 273.507080078125, 'learning_rate': 2.530127041742287e-05, 'epoch': 3.15} +{'loss': 38.9714, 'grad_norm': 243.57203674316406, 'learning_rate': 2.5295825771324866e-05, 'epoch': 3.16} +{'loss': 38.7573, 'grad_norm': 206.15533447265625, 'learning_rate': 2.529038112522686e-05, 'epoch': 3.16} +{'loss': 41.3548, 'grad_norm': 322.87799072265625, 'learning_rate': 2.5284936479128856e-05, 'epoch': 3.16} +{'loss': 30.5113, 'grad_norm': 259.7116394042969, 'learning_rate': 2.5279491833030852e-05, 'epoch': 3.17} +{'loss': 26.152, 'grad_norm': 277.6427307128906, 'learning_rate': 2.527404718693285e-05, 'epoch': 3.17} +{'loss': 25.543, 'grad_norm': 259.84588623046875, 'learning_rate': 2.5268602540834846e-05, 'epoch': 3.17} +{'loss': 25.2503, 'grad_norm': 205.59854125976562, 'learning_rate': 2.526315789473684e-05, 'epoch': 3.18} + 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 890/5520 [45:53<3:39:28, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6754873394966125, 'eval_runtime': 3.1451, 'eval_samples_per_second': 56.914, 'eval_steps_per_second': 56.914, 'epoch': 3.18} +{'loss': 25.8277, 'grad_norm': 231.8431396484375, 'learning_rate': 2.525771324863884e-05, 'epoch': 3.18} +{'loss': 48.7641, 'grad_norm': 437.2222900390625, 'learning_rate': 2.5252268602540835e-05, 'epoch': 3.18} +{'loss': 49.1716, 'grad_norm': 409.23468017578125, 'learning_rate': 2.5246823956442834e-05, 'epoch': 3.19} +{'loss': 49.1204, 'grad_norm': 446.9589538574219, 'learning_rate': 2.524137931034483e-05, 'epoch': 3.19} +{'loss': 46.3124, 'grad_norm': 365.4459228515625, 'learning_rate': 2.5235934664246825e-05, 'epoch': 3.2} +{'loss': 46.4021, 'grad_norm': 329.88677978515625, 'learning_rate': 2.523049001814882e-05, 'epoch': 3.2} +{'loss': 46.4958, 'grad_norm': 271.31201171875, 'learning_rate': 2.5225045372050816e-05, 'epoch': 3.2} +{'loss': 46.238, 'grad_norm': 281.3929138183594, 'learning_rate': 2.521960072595281e-05, 'epoch': 3.21} +{'loss': 47.0312, 'grad_norm': 279.1689147949219, 'learning_rate': 2.521415607985481e-05, 'epoch': 3.21} +{'loss': 46.1837, 'grad_norm': 296.18115234375, 'learning_rate': 2.520871143375681e-05, 'epoch': 3.21} + 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 900/5520 [46:24<3:39:50, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.666180431842804, 'eval_runtime': 3.1315, 'eval_samples_per_second': 57.161, 'eval_steps_per_second': 57.161, 'epoch': 3.21} +{'loss': 46.1023, 'grad_norm': 315.7528991699219, 'learning_rate': 2.5203266787658804e-05, 'epoch': 3.22} +{'loss': 43.9875, 'grad_norm': 296.7471923828125, 'learning_rate': 2.51978221415608e-05, 'epoch': 3.22} +{'loss': 42.0998, 'grad_norm': 238.37600708007812, 'learning_rate': 2.5192377495462795e-05, 'epoch': 3.22} +{'loss': 40.791, 'grad_norm': 221.9834442138672, 'learning_rate': 2.5186932849364794e-05, 'epoch': 3.23} +{'loss': 41.6985, 'grad_norm': 221.9122314453125, 'learning_rate': 2.518148820326679e-05, 'epoch': 3.23} +{'loss': 42.978, 'grad_norm': 269.44561767578125, 'learning_rate': 2.5176043557168784e-05, 'epoch': 3.23} +{'loss': 41.4141, 'grad_norm': 207.09165954589844, 'learning_rate': 2.517059891107078e-05, 'epoch': 3.24} +{'loss': 41.7936, 'grad_norm': 236.3747100830078, 'learning_rate': 2.5165154264972775e-05, 'epoch': 3.24} +{'loss': 42.0031, 'grad_norm': 194.84373474121094, 'learning_rate': 2.515970961887477e-05, 'epoch': 3.25} +{'loss': 43.2596, 'grad_norm': 220.2052459716797, 'learning_rate': 2.5154264972776773e-05, 'epoch': 3.25} + 16%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 910/5520 [46:55<3:37:29, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6527710556983948, 'eval_runtime': 3.1368, 'eval_samples_per_second': 57.065, 'eval_steps_per_second': 57.065, 'epoch': 3.25} +{'loss': 43.2345, 'grad_norm': 190.3020782470703, 'learning_rate': 2.5148820326678768e-05, 'epoch': 3.25} +{'loss': 42.9185, 'grad_norm': 200.23194885253906, 'learning_rate': 2.5143375680580763e-05, 'epoch': 3.26} +{'loss': 42.7929, 'grad_norm': 207.30697631835938, 'learning_rate': 2.513793103448276e-05, 'epoch': 3.26} +{'loss': 43.6829, 'grad_norm': 180.4369354248047, 'learning_rate': 2.5132486388384754e-05, 'epoch': 3.26} +{'loss': 42.6406, 'grad_norm': 169.92384338378906, 'learning_rate': 2.5127041742286753e-05, 'epoch': 3.27} +{'loss': 43.314, 'grad_norm': 224.46177673339844, 'learning_rate': 2.512159709618875e-05, 'epoch': 3.27} +{'loss': 44.1259, 'grad_norm': 246.6527862548828, 'learning_rate': 2.5116152450090744e-05, 'epoch': 3.27} +{'loss': 43.7819, 'grad_norm': 201.84552001953125, 'learning_rate': 2.511070780399274e-05, 'epoch': 3.28} +{'loss': 41.0509, 'grad_norm': 195.65174865722656, 'learning_rate': 2.5105263157894738e-05, 'epoch': 3.28} +{'loss': 39.3365, 'grad_norm': 238.36911010742188, 'learning_rate': 2.5099818511796733e-05, 'epoch': 3.29} + 17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 920/5520 [47:27<3:38:29, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6488128900527954, 'eval_runtime': 3.133, 'eval_samples_per_second': 57.133, 'eval_steps_per_second': 57.133, 'epoch': 3.29} +{'loss': 36.081, 'grad_norm': 235.58831787109375, 'learning_rate': 2.5094373865698732e-05, 'epoch': 3.29} +{'loss': 35.9779, 'grad_norm': 246.3998565673828, 'learning_rate': 2.5088929219600727e-05, 'epoch': 3.29} +{'loss': 35.0636, 'grad_norm': 224.34561157226562, 'learning_rate': 2.5083484573502723e-05, 'epoch': 3.3} +{'loss': 36.6272, 'grad_norm': 203.2981414794922, 'learning_rate': 2.5078039927404718e-05, 'epoch': 3.3} +{'loss': 36.0493, 'grad_norm': 179.4558563232422, 'learning_rate': 2.5072595281306714e-05, 'epoch': 3.3} +{'loss': 36.9084, 'grad_norm': 240.01748657226562, 'learning_rate': 2.5067150635208712e-05, 'epoch': 3.31} +{'loss': 37.3878, 'grad_norm': 264.4375, 'learning_rate': 2.5061705989110708e-05, 'epoch': 3.31} +{'loss': 37.0502, 'grad_norm': 207.66322326660156, 'learning_rate': 2.5056261343012707e-05, 'epoch': 3.31} +{'loss': 39.0532, 'grad_norm': 304.8887634277344, 'learning_rate': 2.5050816696914702e-05, 'epoch': 3.32} +{'loss': 37.9885, 'grad_norm': 242.4520721435547, 'learning_rate': 2.5045372050816697e-05, 'epoch': 3.32} + 17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 930/5520 [47:58<3:37:54, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6546927690505981, 'eval_runtime': 3.1329, 'eval_samples_per_second': 57.136, 'eval_steps_per_second': 57.136, 'epoch': 3.32} +{'loss': 39.0422, 'grad_norm': 297.5693054199219, 'learning_rate': 2.5039927404718693e-05, 'epoch': 3.33} +{'loss': 39.2992, 'grad_norm': 208.76441955566406, 'learning_rate': 2.503448275862069e-05, 'epoch': 3.33} +{'loss': 39.7897, 'grad_norm': 265.2093200683594, 'learning_rate': 2.5029038112522687e-05, 'epoch': 3.33} +{'loss': 39.6141, 'grad_norm': 279.2838439941406, 'learning_rate': 2.5023593466424682e-05, 'epoch': 3.34} +{'loss': 39.5724, 'grad_norm': 246.86895751953125, 'learning_rate': 2.5018148820326678e-05, 'epoch': 3.34} +{'loss': 40.0274, 'grad_norm': 315.27838134765625, 'learning_rate': 2.5012704174228673e-05, 'epoch': 3.34} +{'loss': 27.8964, 'grad_norm': 286.7344665527344, 'learning_rate': 2.5007259528130675e-05, 'epoch': 3.35} +{'loss': 26.1333, 'grad_norm': 320.6955261230469, 'learning_rate': 2.500181488203267e-05, 'epoch': 3.35} +{'loss': 25.0519, 'grad_norm': 271.5133972167969, 'learning_rate': 2.4996370235934666e-05, 'epoch': 3.35} +{'loss': 26.3701, 'grad_norm': 259.59234619140625, 'learning_rate': 2.499092558983666e-05, 'epoch': 3.36} + 17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 940/5520 [48:29<3:36:36, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6857922077178955, 'eval_runtime': 3.1352, 'eval_samples_per_second': 57.093, 'eval_steps_per_second': 57.093, 'epoch': 3.36} +{'loss': 26.9694, 'grad_norm': 240.87948608398438, 'learning_rate': 2.4985480943738657e-05, 'epoch': 3.36} +{'loss': 50.6978, 'grad_norm': 410.1781921386719, 'learning_rate': 2.4980036297640655e-05, 'epoch': 3.36} +{'loss': 49.5872, 'grad_norm': 371.2940979003906, 'learning_rate': 2.497459165154265e-05, 'epoch': 3.37} +{'loss': 48.7744, 'grad_norm': 343.48809814453125, 'learning_rate': 2.4969147005444646e-05, 'epoch': 3.37} +{'loss': 48.104, 'grad_norm': 334.878662109375, 'learning_rate': 2.496370235934664e-05, 'epoch': 3.38} +{'loss': 47.1941, 'grad_norm': 301.94696044921875, 'learning_rate': 2.4958257713248637e-05, 'epoch': 3.38} +{'loss': 46.8274, 'grad_norm': 295.99810791015625, 'learning_rate': 2.4952813067150636e-05, 'epoch': 3.38} +{'loss': 46.8453, 'grad_norm': 240.8074188232422, 'learning_rate': 2.4947368421052635e-05, 'epoch': 3.39} +{'loss': 46.6894, 'grad_norm': 244.65985107421875, 'learning_rate': 2.494192377495463e-05, 'epoch': 3.39} +{'loss': 45.5307, 'grad_norm': 239.5635223388672, 'learning_rate': 2.4936479128856625e-05, 'epoch': 3.39} + 17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 950/5520 [49:00<3:35:42, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6575602293014526, 'eval_runtime': 3.1373, 'eval_samples_per_second': 57.056, 'eval_steps_per_second': 57.056, 'epoch': 3.39} +{'loss': 44.8465, 'grad_norm': 224.8990936279297, 'learning_rate': 2.493103448275862e-05, 'epoch': 3.4} +{'loss': 44.8556, 'grad_norm': 263.9532165527344, 'learning_rate': 2.4925589836660616e-05, 'epoch': 3.4} +{'loss': 43.7434, 'grad_norm': 252.4757080078125, 'learning_rate': 2.4920145190562615e-05, 'epoch': 3.4} +{'loss': 40.3602, 'grad_norm': 204.91795349121094, 'learning_rate': 2.491470054446461e-05, 'epoch': 3.41} +{'loss': 41.5125, 'grad_norm': 259.7920837402344, 'learning_rate': 2.4909255898366606e-05, 'epoch': 3.41} +{'loss': 42.1967, 'grad_norm': 196.34872436523438, 'learning_rate': 2.4903811252268604e-05, 'epoch': 3.42} +{'loss': 41.5637, 'grad_norm': 267.5933837890625, 'learning_rate': 2.48983666061706e-05, 'epoch': 3.42} +{'loss': 41.3467, 'grad_norm': 261.2299499511719, 'learning_rate': 2.4892921960072595e-05, 'epoch': 3.42} +{'loss': 42.9534, 'grad_norm': 195.84051513671875, 'learning_rate': 2.4887477313974594e-05, 'epoch': 3.43} +{'loss': 43.8068, 'grad_norm': 251.25294494628906, 'learning_rate': 2.488203266787659e-05, 'epoch': 3.43} + 17%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 960/5520 [49:31<3:35:12, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6576783657073975, 'eval_runtime': 3.1299, 'eval_samples_per_second': 57.19, 'eval_steps_per_second': 57.19, 'epoch': 3.43} +{'loss': 43.448, 'grad_norm': 221.30291748046875, 'learning_rate': 2.4876588021778585e-05, 'epoch': 3.43} +{'loss': 42.7632, 'grad_norm': 244.35842895507812, 'learning_rate': 2.487114337568058e-05, 'epoch': 3.44} +{'loss': 43.2151, 'grad_norm': 195.3553009033203, 'learning_rate': 2.4865698729582575e-05, 'epoch': 3.44} +{'loss': 43.5462, 'grad_norm': 179.0012969970703, 'learning_rate': 2.4860254083484574e-05, 'epoch': 3.44} +{'loss': 43.5087, 'grad_norm': 200.3195343017578, 'learning_rate': 2.4854809437386573e-05, 'epoch': 3.45} +{'loss': 44.1719, 'grad_norm': 263.8428955078125, 'learning_rate': 2.484936479128857e-05, 'epoch': 3.45} +{'loss': 43.245, 'grad_norm': 208.326416015625, 'learning_rate': 2.4843920145190564e-05, 'epoch': 3.46} +{'loss': 44.3687, 'grad_norm': 193.4184112548828, 'learning_rate': 2.483847549909256e-05, 'epoch': 3.46} +{'loss': 43.7617, 'grad_norm': 201.8892059326172, 'learning_rate': 2.4833030852994555e-05, 'epoch': 3.46} +{'loss': 43.0001, 'grad_norm': 258.5245056152344, 'learning_rate': 2.4827586206896553e-05, 'epoch': 3.47} + 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 970/5520 [50:02<3:35:02, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6453068256378174, 'eval_runtime': 3.1355, 'eval_samples_per_second': 57.087, 'eval_steps_per_second': 57.087, 'epoch': 3.47} +{'loss': 39.1967, 'grad_norm': 218.70947265625, 'learning_rate': 2.482214156079855e-05, 'epoch': 3.47} +{'loss': 36.9852, 'grad_norm': 267.3435363769531, 'learning_rate': 2.4816696914700544e-05, 'epoch': 3.47} +{'loss': 35.9727, 'grad_norm': 285.9330139160156, 'learning_rate': 2.481125226860254e-05, 'epoch': 3.48} +{'loss': 36.7653, 'grad_norm': 215.71005249023438, 'learning_rate': 2.4805807622504538e-05, 'epoch': 3.48} +{'loss': 36.378, 'grad_norm': 232.87876892089844, 'learning_rate': 2.4800362976406537e-05, 'epoch': 3.48} +{'loss': 36.8383, 'grad_norm': 171.5175018310547, 'learning_rate': 2.4794918330308532e-05, 'epoch': 3.49} +{'loss': 37.8672, 'grad_norm': 215.11647033691406, 'learning_rate': 2.4789473684210528e-05, 'epoch': 3.49} +{'loss': 38.2493, 'grad_norm': 219.3248291015625, 'learning_rate': 2.4784029038112523e-05, 'epoch': 3.49} +{'loss': 37.8047, 'grad_norm': 250.36343383789062, 'learning_rate': 2.477858439201452e-05, 'epoch': 3.5} +{'loss': 38.3357, 'grad_norm': 218.4738311767578, 'learning_rate': 2.4773139745916514e-05, 'epoch': 3.5} + 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 980/5520 [50:34<3:35:51, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6516546607017517, 'eval_runtime': 3.1323, 'eval_samples_per_second': 57.147, 'eval_steps_per_second': 57.147, 'epoch': 3.5} +{'loss': 38.4684, 'grad_norm': 229.22828674316406, 'learning_rate': 2.4767695099818513e-05, 'epoch': 3.51} +{'loss': 38.4852, 'grad_norm': 177.5777130126953, 'learning_rate': 2.4762250453720508e-05, 'epoch': 3.51} +{'loss': 39.2789, 'grad_norm': 206.41226196289062, 'learning_rate': 2.4756805807622507e-05, 'epoch': 3.51} +{'loss': 39.4009, 'grad_norm': 206.19235229492188, 'learning_rate': 2.4751361161524502e-05, 'epoch': 3.52} +{'loss': 40.2545, 'grad_norm': 293.0887145996094, 'learning_rate': 2.4745916515426498e-05, 'epoch': 3.52} +{'loss': 40.174, 'grad_norm': 304.7360534667969, 'learning_rate': 2.4740471869328496e-05, 'epoch': 3.52} +{'loss': 28.7529, 'grad_norm': 292.6968078613281, 'learning_rate': 2.4735027223230492e-05, 'epoch': 3.53} +{'loss': 25.3517, 'grad_norm': 188.4938201904297, 'learning_rate': 2.4729582577132487e-05, 'epoch': 3.53} +{'loss': 24.9574, 'grad_norm': 187.330322265625, 'learning_rate': 2.4724137931034483e-05, 'epoch': 3.53} +{'loss': 26.0505, 'grad_norm': 198.25450134277344, 'learning_rate': 2.4718693284936478e-05, 'epoch': 3.54} + 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 990/5520 [51:05<3:34:46, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6837891936302185, 'eval_runtime': 3.1313, 'eval_samples_per_second': 57.164, 'eval_steps_per_second': 57.164, 'epoch': 3.54} +{'loss': 27.1157, 'grad_norm': 221.72662353515625, 'learning_rate': 2.4713248638838473e-05, 'epoch': 3.54} +{'loss': 50.0102, 'grad_norm': 449.80987548828125, 'learning_rate': 2.4707803992740472e-05, 'epoch': 3.55} +{'loss': 50.162, 'grad_norm': 450.6602478027344, 'learning_rate': 2.470235934664247e-05, 'epoch': 3.55} +{'loss': 49.1374, 'grad_norm': 424.1731872558594, 'learning_rate': 2.4696914700544466e-05, 'epoch': 3.55} +{'loss': 47.5901, 'grad_norm': 339.78997802734375, 'learning_rate': 2.469147005444646e-05, 'epoch': 3.56} +{'loss': 48.7289, 'grad_norm': 270.9290466308594, 'learning_rate': 2.4686025408348457e-05, 'epoch': 3.56} +{'loss': 45.926, 'grad_norm': 254.77444458007812, 'learning_rate': 2.4680580762250456e-05, 'epoch': 3.56} +{'loss': 46.2578, 'grad_norm': 309.8949890136719, 'learning_rate': 2.467513611615245e-05, 'epoch': 3.57} +{'loss': 46.5274, 'grad_norm': 264.5209655761719, 'learning_rate': 2.4669691470054447e-05, 'epoch': 3.57} +{'loss': 45.0636, 'grad_norm': 306.8301696777344, 'learning_rate': 2.4664246823956442e-05, 'epoch': 3.57} + 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1000/5520 [51:36<3:34:26, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6581718921661377, 'eval_runtime': 3.1307, 'eval_samples_per_second': 57.176, 'eval_steps_per_second': 57.176, 'epoch': 3.57} +{'loss': 44.8446, 'grad_norm': 228.46180725097656, 'learning_rate': 2.4658802177858437e-05, 'epoch': 3.58} +{'loss': 44.5141, 'grad_norm': 246.97792053222656, 'learning_rate': 2.4653357531760436e-05, 'epoch': 3.58} +{'loss': 42.7822, 'grad_norm': 199.88819885253906, 'learning_rate': 2.4647912885662435e-05, 'epoch': 3.59} +{'loss': 41.7004, 'grad_norm': 291.8363952636719, 'learning_rate': 2.464246823956443e-05, 'epoch': 3.59} +{'loss': 41.154, 'grad_norm': 194.8997039794922, 'learning_rate': 2.4637023593466426e-05, 'epoch': 3.59} +{'loss': 41.4898, 'grad_norm': 271.03863525390625, 'learning_rate': 2.463157894736842e-05, 'epoch': 3.6} +{'loss': 42.7646, 'grad_norm': 219.783203125, 'learning_rate': 2.4626134301270416e-05, 'epoch': 3.6} +{'loss': 41.9049, 'grad_norm': 232.6287384033203, 'learning_rate': 2.4620689655172415e-05, 'epoch': 3.6} +{'loss': 42.2493, 'grad_norm': 209.7451934814453, 'learning_rate': 2.461524500907441e-05, 'epoch': 3.61} +{'loss': 42.112, 'grad_norm': 202.67608642578125, 'learning_rate': 2.4609800362976406e-05, 'epoch': 3.61} + 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1000/5520 [51:39<3:34:26, 2.85s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1010/5520 [52:09<3:35:34, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6473406553268433, 'eval_runtime': 3.136, 'eval_samples_per_second': 57.078, 'eval_steps_per_second': 57.078, 'epoch': 3.61} +{'loss': 42.5126, 'grad_norm': 196.04354858398438, 'learning_rate': 2.4604355716878405e-05, 'epoch': 3.61} +{'loss': 44.0572, 'grad_norm': 187.22372436523438, 'learning_rate': 2.45989110707804e-05, 'epoch': 3.62} +{'loss': 42.9538, 'grad_norm': 249.96971130371094, 'learning_rate': 2.4593466424682395e-05, 'epoch': 3.62} +{'loss': 44.328, 'grad_norm': 215.70565795898438, 'learning_rate': 2.4588021778584394e-05, 'epoch': 3.62} +{'loss': 43.6176, 'grad_norm': 180.83642578125, 'learning_rate': 2.458257713248639e-05, 'epoch': 3.63} +{'loss': 42.8305, 'grad_norm': 190.6321563720703, 'learning_rate': 2.4577132486388385e-05, 'epoch': 3.63} +{'loss': 43.6692, 'grad_norm': 192.47682189941406, 'learning_rate': 2.457168784029038e-05, 'epoch': 3.64} +{'loss': 43.6698, 'grad_norm': 204.16403198242188, 'learning_rate': 2.4566243194192376e-05, 'epoch': 3.64} +{'loss': 42.4672, 'grad_norm': 216.57371520996094, 'learning_rate': 2.4560798548094374e-05, 'epoch': 3.64} +{'loss': 40.9764, 'grad_norm': 209.3368377685547, 'learning_rate': 2.4555353901996373e-05, 'epoch': 3.65} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 18%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1020/5520 [52:40<3:33:28, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6477307081222534, 'eval_runtime': 3.1316, 'eval_samples_per_second': 57.159, 'eval_steps_per_second': 57.159, 'epoch': 3.65} +{'loss': 37.8193, 'grad_norm': 209.23538208007812, 'learning_rate': 2.454990925589837e-05, 'epoch': 3.65} +{'loss': 36.6497, 'grad_norm': 236.15859985351562, 'learning_rate': 2.4544464609800364e-05, 'epoch': 3.65} +{'loss': 36.5181, 'grad_norm': 230.68008422851562, 'learning_rate': 2.453901996370236e-05, 'epoch': 3.66} +{'loss': 37.4292, 'grad_norm': 233.6422882080078, 'learning_rate': 2.4533575317604355e-05, 'epoch': 3.66} +{'loss': 36.8303, 'grad_norm': 263.49554443359375, 'learning_rate': 2.4528130671506354e-05, 'epoch': 3.66} +{'loss': 38.5344, 'grad_norm': 259.7931823730469, 'learning_rate': 2.452268602540835e-05, 'epoch': 3.67} +{'loss': 37.9728, 'grad_norm': 227.5961151123047, 'learning_rate': 2.4517241379310344e-05, 'epoch': 3.67} +{'loss': 37.1389, 'grad_norm': 209.28163146972656, 'learning_rate': 2.451179673321234e-05, 'epoch': 3.68} +{'loss': 37.4052, 'grad_norm': 284.8781433105469, 'learning_rate': 2.450635208711434e-05, 'epoch': 3.68} +{'loss': 39.1912, 'grad_norm': 256.3425598144531, 'learning_rate': 2.4500907441016337e-05, 'epoch': 3.68} + 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1030/5520 [53:11<3:33:02, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6528274416923523, 'eval_runtime': 3.1315, 'eval_samples_per_second': 57.161, 'eval_steps_per_second': 57.161, 'epoch': 3.68} +{'loss': 37.205, 'grad_norm': 218.8914031982422, 'learning_rate': 2.4495462794918333e-05, 'epoch': 3.69} +{'loss': 40.28, 'grad_norm': 254.91282653808594, 'learning_rate': 2.4490018148820328e-05, 'epoch': 3.69} +{'loss': 39.4649, 'grad_norm': 235.3753662109375, 'learning_rate': 2.4484573502722323e-05, 'epoch': 3.69} +{'loss': 38.5807, 'grad_norm': 286.5908203125, 'learning_rate': 2.447912885662432e-05, 'epoch': 3.7} +{'loss': 39.2304, 'grad_norm': 227.684814453125, 'learning_rate': 2.4473684210526318e-05, 'epoch': 3.7} +{'loss': 39.5135, 'grad_norm': 230.00128173828125, 'learning_rate': 2.4468239564428313e-05, 'epoch': 3.7} +{'loss': 36.6274, 'grad_norm': 198.72862243652344, 'learning_rate': 2.446279491833031e-05, 'epoch': 3.71} +{'loss': 25.5852, 'grad_norm': 263.6575012207031, 'learning_rate': 2.4457350272232304e-05, 'epoch': 3.71} +{'loss': 24.8593, 'grad_norm': 273.997314453125, 'learning_rate': 2.4451905626134302e-05, 'epoch': 3.72} +{'loss': 25.4596, 'grad_norm': 180.25997924804688, 'learning_rate': 2.4446460980036298e-05, 'epoch': 3.72} + 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1040/5520 [53:43<3:32:13, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6783067584037781, 'eval_runtime': 3.1325, 'eval_samples_per_second': 57.142, 'eval_steps_per_second': 57.142, 'epoch': 3.72} +{'loss': 25.9792, 'grad_norm': 203.3702850341797, 'learning_rate': 2.4441016333938297e-05, 'epoch': 3.72} +{'loss': 48.2254, 'grad_norm': 393.4874572753906, 'learning_rate': 2.4435571687840292e-05, 'epoch': 3.73} +{'loss': 49.7546, 'grad_norm': 369.2442626953125, 'learning_rate': 2.4430127041742287e-05, 'epoch': 3.73} +{'loss': 48.1843, 'grad_norm': 339.0132751464844, 'learning_rate': 2.4424682395644283e-05, 'epoch': 3.73} +{'loss': 47.2471, 'grad_norm': 322.1737060546875, 'learning_rate': 2.4419237749546278e-05, 'epoch': 3.74} +{'loss': 47.5831, 'grad_norm': 330.0899658203125, 'learning_rate': 2.4413793103448277e-05, 'epoch': 3.74} +{'loss': 47.0229, 'grad_norm': 306.1767578125, 'learning_rate': 2.4408348457350272e-05, 'epoch': 3.74} +{'loss': 46.801, 'grad_norm': 279.7237548828125, 'learning_rate': 2.440290381125227e-05, 'epoch': 3.75} +{'loss': 47.2659, 'grad_norm': 277.7254333496094, 'learning_rate': 2.4397459165154266e-05, 'epoch': 3.75} +{'loss': 46.1864, 'grad_norm': 288.577880859375, 'learning_rate': 2.4392014519056262e-05, 'epoch': 3.75} + 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1050/5520 [54:14<3:31:57, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6584362983703613, 'eval_runtime': 3.1331, 'eval_samples_per_second': 57.133, 'eval_steps_per_second': 57.133, 'epoch': 3.75} +{'loss': 43.9753, 'grad_norm': 282.44989013671875, 'learning_rate': 2.4386569872958257e-05, 'epoch': 3.76} +{'loss': 44.1678, 'grad_norm': 186.32090759277344, 'learning_rate': 2.4381125226860256e-05, 'epoch': 3.76} +{'loss': 42.0396, 'grad_norm': 227.9755096435547, 'learning_rate': 2.437568058076225e-05, 'epoch': 3.77} +{'loss': 40.0181, 'grad_norm': 188.82789611816406, 'learning_rate': 2.4370235934664247e-05, 'epoch': 3.77} +{'loss': 41.0851, 'grad_norm': 222.2530517578125, 'learning_rate': 2.4364791288566242e-05, 'epoch': 3.77} +{'loss': 41.0595, 'grad_norm': 196.7293243408203, 'learning_rate': 2.4359346642468238e-05, 'epoch': 3.78} +{'loss': 41.8551, 'grad_norm': 247.01638793945312, 'learning_rate': 2.435390199637024e-05, 'epoch': 3.78} +{'loss': 41.5365, 'grad_norm': 238.08656311035156, 'learning_rate': 2.4348457350272235e-05, 'epoch': 3.78} +{'loss': 42.804, 'grad_norm': 205.6416778564453, 'learning_rate': 2.434301270417423e-05, 'epoch': 3.79} +{'loss': 42.4529, 'grad_norm': 236.24205017089844, 'learning_rate': 2.4337568058076226e-05, 'epoch': 3.79} + 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1060/5520 [54:45<3:31:44, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6511489152908325, 'eval_runtime': 3.1324, 'eval_samples_per_second': 57.145, 'eval_steps_per_second': 57.145, 'epoch': 3.79} +{'loss': 42.2678, 'grad_norm': 195.8008575439453, 'learning_rate': 2.433212341197822e-05, 'epoch': 3.79} +{'loss': 42.4501, 'grad_norm': 218.7563018798828, 'learning_rate': 2.4326678765880217e-05, 'epoch': 3.8} +{'loss': 43.0947, 'grad_norm': 209.214599609375, 'learning_rate': 2.4321234119782215e-05, 'epoch': 3.8} +{'loss': 44.3962, 'grad_norm': 235.3767852783203, 'learning_rate': 2.431578947368421e-05, 'epoch': 3.81} +{'loss': 43.5015, 'grad_norm': 189.2035369873047, 'learning_rate': 2.4310344827586206e-05, 'epoch': 3.81} +{'loss': 43.4919, 'grad_norm': 185.23617553710938, 'learning_rate': 2.4304900181488205e-05, 'epoch': 3.81} +{'loss': 43.5435, 'grad_norm': 197.72720336914062, 'learning_rate': 2.42994555353902e-05, 'epoch': 3.82} +{'loss': 42.8559, 'grad_norm': 210.86380004882812, 'learning_rate': 2.42940108892922e-05, 'epoch': 3.82} +{'loss': 43.2725, 'grad_norm': 183.15798950195312, 'learning_rate': 2.4288566243194194e-05, 'epoch': 3.82} +{'loss': 39.7816, 'grad_norm': 195.6173858642578, 'learning_rate': 2.428312159709619e-05, 'epoch': 3.83} + 19%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1070/5520 [55:16<3:30:06, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6438961625099182, 'eval_runtime': 3.1358, 'eval_samples_per_second': 57.082, 'eval_steps_per_second': 57.082, 'epoch': 3.83} +{'loss': 37.7217, 'grad_norm': 218.30186462402344, 'learning_rate': 2.4277676950998185e-05, 'epoch': 3.83} +{'loss': 35.3267, 'grad_norm': 226.92807006835938, 'learning_rate': 2.427223230490018e-05, 'epoch': 3.83} +{'loss': 36.4474, 'grad_norm': 277.8228759765625, 'learning_rate': 2.4266787658802176e-05, 'epoch': 3.84} +{'loss': 36.8737, 'grad_norm': 233.3556365966797, 'learning_rate': 2.4261343012704175e-05, 'epoch': 3.84} +{'loss': 35.5258, 'grad_norm': 232.3887176513672, 'learning_rate': 2.425589836660617e-05, 'epoch': 3.85} +{'loss': 36.264, 'grad_norm': 212.23741149902344, 'learning_rate': 2.425045372050817e-05, 'epoch': 3.85} +{'loss': 37.4407, 'grad_norm': 262.5358581542969, 'learning_rate': 2.4245009074410164e-05, 'epoch': 3.85} +{'loss': 36.753, 'grad_norm': 250.24459838867188, 'learning_rate': 2.423956442831216e-05, 'epoch': 3.86} +{'loss': 38.1465, 'grad_norm': 234.84124755859375, 'learning_rate': 2.423411978221416e-05, 'epoch': 3.86} +{'loss': 38.1092, 'grad_norm': 258.2744140625, 'learning_rate': 2.4228675136116154e-05, 'epoch': 3.86} + 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1080/5520 [55:47<3:30:45, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6504554152488708, 'eval_runtime': 3.1368, 'eval_samples_per_second': 57.065, 'eval_steps_per_second': 57.065, 'epoch': 3.86} +{'loss': 37.5608, 'grad_norm': 210.83404541015625, 'learning_rate': 2.422323049001815e-05, 'epoch': 3.87} +{'loss': 38.8805, 'grad_norm': 196.8865203857422, 'learning_rate': 2.4217785843920145e-05, 'epoch': 3.87} +{'loss': 38.3821, 'grad_norm': 195.45758056640625, 'learning_rate': 2.421234119782214e-05, 'epoch': 3.87} +{'loss': 39.2063, 'grad_norm': 249.15740966796875, 'learning_rate': 2.4206896551724135e-05, 'epoch': 3.88} +{'loss': 40.8177, 'grad_norm': 224.40455627441406, 'learning_rate': 2.4201451905626138e-05, 'epoch': 3.88} +{'loss': 39.2645, 'grad_norm': 272.9620361328125, 'learning_rate': 2.4196007259528133e-05, 'epoch': 3.88} +{'loss': 38.6852, 'grad_norm': 230.61953735351562, 'learning_rate': 2.419056261343013e-05, 'epoch': 3.89} +{'loss': 29.6319, 'grad_norm': 209.87234497070312, 'learning_rate': 2.4185117967332124e-05, 'epoch': 3.89} +{'loss': 24.6807, 'grad_norm': 249.635009765625, 'learning_rate': 2.417967332123412e-05, 'epoch': 3.9} +{'loss': 25.785, 'grad_norm': 185.14309692382812, 'learning_rate': 2.4174228675136118e-05, 'epoch': 3.9} + 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1090/5520 [56:19<3:30:16, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6733376979827881, 'eval_runtime': 3.129, 'eval_samples_per_second': 57.208, 'eval_steps_per_second': 57.208, 'epoch': 3.9} +{'loss': 25.702, 'grad_norm': 190.28952026367188, 'learning_rate': 2.4168784029038113e-05, 'epoch': 3.9} +{'loss': 49.0322, 'grad_norm': 431.5606689453125, 'learning_rate': 2.416333938294011e-05, 'epoch': 3.91} +{'loss': 48.3669, 'grad_norm': 396.85345458984375, 'learning_rate': 2.4157894736842104e-05, 'epoch': 3.91} +{'loss': 47.3948, 'grad_norm': 369.1654357910156, 'learning_rate': 2.4152450090744103e-05, 'epoch': 3.91} +{'loss': 47.5562, 'grad_norm': 320.3822937011719, 'learning_rate': 2.4147005444646098e-05, 'epoch': 3.92} +{'loss': 46.1018, 'grad_norm': 300.494140625, 'learning_rate': 2.4141560798548097e-05, 'epoch': 3.92} +{'loss': 44.9794, 'grad_norm': 290.30462646484375, 'learning_rate': 2.4136116152450092e-05, 'epoch': 3.92} +{'loss': 43.7329, 'grad_norm': 299.4498596191406, 'learning_rate': 2.4130671506352088e-05, 'epoch': 3.93} +{'loss': 43.3881, 'grad_norm': 296.0865783691406, 'learning_rate': 2.4125226860254083e-05, 'epoch': 3.93} +{'loss': 42.518, 'grad_norm': 227.40028381347656, 'learning_rate': 2.411978221415608e-05, 'epoch': 3.94} + 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1100/5520 [56:50<3:29:09, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6501370072364807, 'eval_runtime': 3.1357, 'eval_samples_per_second': 57.084, 'eval_steps_per_second': 57.084, 'epoch': 3.94} +{'loss': 42.5582, 'grad_norm': 236.79466247558594, 'learning_rate': 2.4114337568058077e-05, 'epoch': 3.94} +{'loss': 42.1563, 'grad_norm': 331.5859375, 'learning_rate': 2.4108892921960073e-05, 'epoch': 3.94} +{'loss': 43.1934, 'grad_norm': 296.2573547363281, 'learning_rate': 2.410344827586207e-05, 'epoch': 3.95} +{'loss': 43.4579, 'grad_norm': 258.93499755859375, 'learning_rate': 2.4098003629764067e-05, 'epoch': 3.95} +{'loss': 44.4464, 'grad_norm': 275.31170654296875, 'learning_rate': 2.4092558983666062e-05, 'epoch': 3.95} +{'loss': 44.9596, 'grad_norm': 276.1750183105469, 'learning_rate': 2.4087114337568058e-05, 'epoch': 3.96} +{'loss': 40.7271, 'grad_norm': 282.0018310546875, 'learning_rate': 2.4081669691470056e-05, 'epoch': 3.96} +{'loss': 36.7406, 'grad_norm': 350.2434387207031, 'learning_rate': 2.407622504537205e-05, 'epoch': 3.96} +{'loss': 36.6965, 'grad_norm': 264.5498046875, 'learning_rate': 2.4070780399274047e-05, 'epoch': 3.97} +{'loss': 39.8293, 'grad_norm': 285.5101623535156, 'learning_rate': 2.4065335753176042e-05, 'epoch': 3.97} + 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1110/5520 [57:21<3:25:05, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6441511511802673, 'eval_runtime': 3.13, 'eval_samples_per_second': 57.189, 'eval_steps_per_second': 57.189, 'epoch': 3.97} +{'loss': 39.3198, 'grad_norm': 307.22113037109375, 'learning_rate': 2.4059891107078038e-05, 'epoch': 3.98} +{'loss': 39.2073, 'grad_norm': 214.6739044189453, 'learning_rate': 2.405444646098004e-05, 'epoch': 3.98} +{'loss': 39.0405, 'grad_norm': 205.13401794433594, 'learning_rate': 2.4049001814882035e-05, 'epoch': 3.98} +{'loss': 40.9828, 'grad_norm': 200.275634765625, 'learning_rate': 2.404355716878403e-05, 'epoch': 3.99} +{'loss': 40.8515, 'grad_norm': 239.47377014160156, 'learning_rate': 2.4038112522686026e-05, 'epoch': 3.99} +{'loss': 28.9885, 'grad_norm': 148.22445678710938, 'learning_rate': 2.403266787658802e-05, 'epoch': 3.99} +{'loss': 25.4718, 'grad_norm': 190.1692352294922, 'learning_rate': 2.4027223230490017e-05, 'epoch': 4.0} +{'loss': 23.4711, 'grad_norm': 180.45884704589844, 'learning_rate': 2.4021778584392016e-05, 'epoch': 4.0} +{'loss': 45.9855, 'grad_norm': 357.0400390625, 'learning_rate': 2.401633393829401e-05, 'epoch': 4.0} +{'loss': 47.2321, 'grad_norm': 361.6748962402344, 'learning_rate': 2.4010889292196006e-05, 'epoch': 4.01} + 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1120/5520 [57:52<3:28:18, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6618791818618774, 'eval_runtime': 3.1331, 'eval_samples_per_second': 57.131, 'eval_steps_per_second': 57.131, 'epoch': 4.01} +{'loss': 46.4521, 'grad_norm': 350.3221740722656, 'learning_rate': 2.4005444646098002e-05, 'epoch': 4.01} +{'loss': 45.3017, 'grad_norm': 279.218994140625, 'learning_rate': 2.4e-05, 'epoch': 4.01} +{'loss': 47.0519, 'grad_norm': 247.94485473632812, 'learning_rate': 2.3994555353902e-05, 'epoch': 4.02} +{'loss': 46.2511, 'grad_norm': 218.910400390625, 'learning_rate': 2.3989110707803995e-05, 'epoch': 4.02} +{'loss': 44.8028, 'grad_norm': 229.89830017089844, 'learning_rate': 2.398366606170599e-05, 'epoch': 4.03} +{'loss': 46.1378, 'grad_norm': 225.46900939941406, 'learning_rate': 2.3978221415607986e-05, 'epoch': 4.03} +{'loss': 45.8397, 'grad_norm': 243.09857177734375, 'learning_rate': 2.397277676950998e-05, 'epoch': 4.03} +{'loss': 45.481, 'grad_norm': 219.63043212890625, 'learning_rate': 2.396733212341198e-05, 'epoch': 4.04} +{'loss': 43.6477, 'grad_norm': 214.18118286132812, 'learning_rate': 2.3961887477313975e-05, 'epoch': 4.04} +{'loss': 41.9656, 'grad_norm': 228.6083984375, 'learning_rate': 2.395644283121597e-05, 'epoch': 4.04} + 20%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1130/5520 [58:23<3:28:36, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6450154185295105, 'eval_runtime': 3.1336, 'eval_samples_per_second': 57.123, 'eval_steps_per_second': 57.123, 'epoch': 4.04} +{'loss': 42.0314, 'grad_norm': 234.56243896484375, 'learning_rate': 2.395099818511797e-05, 'epoch': 4.05} +{'loss': 41.8559, 'grad_norm': 252.39718627929688, 'learning_rate': 2.3945553539019965e-05, 'epoch': 4.05} +{'loss': 41.411, 'grad_norm': 249.19015502929688, 'learning_rate': 2.394010889292196e-05, 'epoch': 4.05} +{'loss': 41.2435, 'grad_norm': 216.54139709472656, 'learning_rate': 2.393466424682396e-05, 'epoch': 4.06} +{'loss': 40.9555, 'grad_norm': 269.6858825683594, 'learning_rate': 2.3929219600725954e-05, 'epoch': 4.06} +{'loss': 41.8034, 'grad_norm': 289.1708984375, 'learning_rate': 2.392377495462795e-05, 'epoch': 4.07} +{'loss': 42.3489, 'grad_norm': 225.65097045898438, 'learning_rate': 2.3918330308529945e-05, 'epoch': 4.07} +{'loss': 42.1899, 'grad_norm': 241.1715545654297, 'learning_rate': 2.391288566243194e-05, 'epoch': 4.07} +{'loss': 42.7326, 'grad_norm': 225.5276336669922, 'learning_rate': 2.390744101633394e-05, 'epoch': 4.08} +{'loss': 41.9397, 'grad_norm': 217.30703735351562, 'learning_rate': 2.3901996370235938e-05, 'epoch': 4.08} + 21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1140/5520 [58:54<3:26:45, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6440457701683044, 'eval_runtime': 3.1323, 'eval_samples_per_second': 57.146, 'eval_steps_per_second': 57.146, 'epoch': 4.08} +{'loss': 43.5721, 'grad_norm': 191.2023162841797, 'learning_rate': 2.3896551724137933e-05, 'epoch': 4.08} +{'loss': 42.9942, 'grad_norm': 204.53013610839844, 'learning_rate': 2.389110707803993e-05, 'epoch': 4.09} +{'loss': 42.8992, 'grad_norm': 206.78817749023438, 'learning_rate': 2.3885662431941924e-05, 'epoch': 4.09} +{'loss': 42.39, 'grad_norm': 224.03082275390625, 'learning_rate': 2.388021778584392e-05, 'epoch': 4.09} +{'loss': 43.3473, 'grad_norm': 249.23992919921875, 'learning_rate': 2.3874773139745918e-05, 'epoch': 4.1} +{'loss': 42.5243, 'grad_norm': 249.36526489257812, 'learning_rate': 2.3869328493647914e-05, 'epoch': 4.1} +{'loss': 39.7782, 'grad_norm': 204.98721313476562, 'learning_rate': 2.386388384754991e-05, 'epoch': 4.1} +{'loss': 36.1737, 'grad_norm': 204.4314422607422, 'learning_rate': 2.3858439201451904e-05, 'epoch': 4.11} +{'loss': 37.119, 'grad_norm': 207.8656005859375, 'learning_rate': 2.3852994555353903e-05, 'epoch': 4.11} +{'loss': 34.5701, 'grad_norm': 204.60365295410156, 'learning_rate': 2.3847549909255902e-05, 'epoch': 4.12} + 21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1150/5520 [59:26<3:27:07, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6381516456604004, 'eval_runtime': 3.1335, 'eval_samples_per_second': 57.124, 'eval_steps_per_second': 57.124, 'epoch': 4.12} +{'loss': 35.8345, 'grad_norm': 207.82247924804688, 'learning_rate': 2.3842105263157897e-05, 'epoch': 4.12} +{'loss': 36.3975, 'grad_norm': 203.7960662841797, 'learning_rate': 2.3836660617059893e-05, 'epoch': 4.12} +{'loss': 36.1556, 'grad_norm': 187.17431640625, 'learning_rate': 2.3831215970961888e-05, 'epoch': 4.13} +{'loss': 36.8714, 'grad_norm': 224.93003845214844, 'learning_rate': 2.3825771324863883e-05, 'epoch': 4.13} +{'loss': 37.5072, 'grad_norm': 235.7632293701172, 'learning_rate': 2.382032667876588e-05, 'epoch': 4.13} +{'loss': 38.0648, 'grad_norm': 261.4077453613281, 'learning_rate': 2.3814882032667878e-05, 'epoch': 4.14} +{'loss': 37.1813, 'grad_norm': 233.9202117919922, 'learning_rate': 2.3809437386569873e-05, 'epoch': 4.14} +{'loss': 39.7793, 'grad_norm': 343.1669006347656, 'learning_rate': 2.380399274047187e-05, 'epoch': 4.14} +{'loss': 39.0443, 'grad_norm': 296.18121337890625, 'learning_rate': 2.3798548094373867e-05, 'epoch': 4.15} +{'loss': 39.9487, 'grad_norm': 261.0748291015625, 'learning_rate': 2.3793103448275862e-05, 'epoch': 4.15} + 21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1160/5520 [59:57<3:27:19, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6574633717536926, 'eval_runtime': 3.1318, 'eval_samples_per_second': 57.156, 'eval_steps_per_second': 57.156, 'epoch': 4.15} +{'loss': 39.6622, 'grad_norm': 220.5347137451172, 'learning_rate': 2.378765880217786e-05, 'epoch': 4.16} +{'loss': 39.6427, 'grad_norm': 243.7288360595703, 'learning_rate': 2.3782214156079857e-05, 'epoch': 4.16} +{'loss': 39.4682, 'grad_norm': 223.01170349121094, 'learning_rate': 2.3776769509981852e-05, 'epoch': 4.16} +{'loss': 29.4783, 'grad_norm': 292.18768310546875, 'learning_rate': 2.3771324863883847e-05, 'epoch': 4.17} +{'loss': 24.6701, 'grad_norm': 253.28433227539062, 'learning_rate': 2.3765880217785843e-05, 'epoch': 4.17} +{'loss': 24.7208, 'grad_norm': 213.90155029296875, 'learning_rate': 2.3760435571687838e-05, 'epoch': 4.17} +{'loss': 24.5906, 'grad_norm': 216.52125549316406, 'learning_rate': 2.3754990925589837e-05, 'epoch': 4.18} +{'loss': 25.9308, 'grad_norm': 208.77516174316406, 'learning_rate': 2.3749546279491836e-05, 'epoch': 4.18} +{'loss': 48.6681, 'grad_norm': 401.13751220703125, 'learning_rate': 2.374410163339383e-05, 'epoch': 4.18} +{'loss': 48.9605, 'grad_norm': 380.1224365234375, 'learning_rate': 2.3738656987295826e-05, 'epoch': 4.19} + 21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1170/5520 [1:00:28<3:26:31, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6683643460273743, 'eval_runtime': 3.1304, 'eval_samples_per_second': 57.181, 'eval_steps_per_second': 57.181, 'epoch': 4.19} +{'loss': 48.419, 'grad_norm': 383.3838806152344, 'learning_rate': 2.3733212341197822e-05, 'epoch': 4.19} +{'loss': 46.7725, 'grad_norm': 290.1167907714844, 'learning_rate': 2.372776769509982e-05, 'epoch': 4.2} +{'loss': 45.6624, 'grad_norm': 260.7622375488281, 'learning_rate': 2.3722323049001816e-05, 'epoch': 4.2} +{'loss': 45.9416, 'grad_norm': 300.2881774902344, 'learning_rate': 2.371687840290381e-05, 'epoch': 4.2} +{'loss': 45.748, 'grad_norm': 241.06045532226562, 'learning_rate': 2.3711433756805807e-05, 'epoch': 4.21} +{'loss': 45.6519, 'grad_norm': 218.68606567382812, 'learning_rate': 2.3705989110707802e-05, 'epoch': 4.21} +{'loss': 44.2927, 'grad_norm': 227.5732421875, 'learning_rate': 2.37005444646098e-05, 'epoch': 4.21} +{'loss': 45.37, 'grad_norm': 295.8132629394531, 'learning_rate': 2.36950998185118e-05, 'epoch': 4.22} +{'loss': 44.3496, 'grad_norm': 239.5023193359375, 'learning_rate': 2.3689655172413795e-05, 'epoch': 4.22} +{'loss': 41.8493, 'grad_norm': 211.12631225585938, 'learning_rate': 2.368421052631579e-05, 'epoch': 4.22} + 21%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1180/5520 [1:00:59<3:25:04, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6506755948066711, 'eval_runtime': 3.1303, 'eval_samples_per_second': 57.183, 'eval_steps_per_second': 57.183, 'epoch': 4.22} +{'loss': 40.9604, 'grad_norm': 291.5223388671875, 'learning_rate': 2.3678765880217786e-05, 'epoch': 4.23} +{'loss': 40.1213, 'grad_norm': 218.4868927001953, 'learning_rate': 2.367332123411978e-05, 'epoch': 4.23} +{'loss': 41.5535, 'grad_norm': 176.35243225097656, 'learning_rate': 2.366787658802178e-05, 'epoch': 4.23} +{'loss': 40.1666, 'grad_norm': 188.4041290283203, 'learning_rate': 2.3662431941923775e-05, 'epoch': 4.24} +{'loss': 40.667, 'grad_norm': 236.32740783691406, 'learning_rate': 2.365698729582577e-05, 'epoch': 4.24} +{'loss': 41.7168, 'grad_norm': 197.1793670654297, 'learning_rate': 2.365154264972777e-05, 'epoch': 4.25} +{'loss': 42.7801, 'grad_norm': 242.61181640625, 'learning_rate': 2.3646098003629765e-05, 'epoch': 4.25} +{'loss': 42.7235, 'grad_norm': 268.12738037109375, 'learning_rate': 2.364065335753176e-05, 'epoch': 4.25} +{'loss': 42.464, 'grad_norm': 244.36843872070312, 'learning_rate': 2.363520871143376e-05, 'epoch': 4.26} +{'loss': 42.0016, 'grad_norm': 249.46437072753906, 'learning_rate': 2.3629764065335754e-05, 'epoch': 4.26} + 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1190/5520 [1:01:30<3:25:38, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6450306177139282, 'eval_runtime': 3.1335, 'eval_samples_per_second': 57.125, 'eval_steps_per_second': 57.125, 'epoch': 4.26} +{'loss': 42.5624, 'grad_norm': 205.0978546142578, 'learning_rate': 2.362431941923775e-05, 'epoch': 4.26} +{'loss': 42.9771, 'grad_norm': 220.79122924804688, 'learning_rate': 2.3618874773139745e-05, 'epoch': 4.27} +{'loss': 43.9198, 'grad_norm': 199.8367156982422, 'learning_rate': 2.361343012704174e-05, 'epoch': 4.27} +{'loss': 43.2283, 'grad_norm': 195.33636474609375, 'learning_rate': 2.360798548094374e-05, 'epoch': 4.27} +{'loss': 43.1352, 'grad_norm': 189.04556274414062, 'learning_rate': 2.3602540834845738e-05, 'epoch': 4.28} +{'loss': 41.8298, 'grad_norm': 196.6824951171875, 'learning_rate': 2.3597096188747734e-05, 'epoch': 4.28} +{'loss': 42.0144, 'grad_norm': 164.40245056152344, 'learning_rate': 2.359165154264973e-05, 'epoch': 4.29} +{'loss': 39.0134, 'grad_norm': 212.00314331054688, 'learning_rate': 2.3586206896551724e-05, 'epoch': 4.29} +{'loss': 35.7557, 'grad_norm': 220.7440643310547, 'learning_rate': 2.358076225045372e-05, 'epoch': 4.29} +{'loss': 36.0808, 'grad_norm': 196.58985900878906, 'learning_rate': 2.357531760435572e-05, 'epoch': 4.3} + 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1200/5520 [1:02:02<3:24:43, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6393749713897705, 'eval_runtime': 3.131, 'eval_samples_per_second': 57.171, 'eval_steps_per_second': 57.171, 'epoch': 4.3} +{'loss': 36.0987, 'grad_norm': 194.13232421875, 'learning_rate': 2.3569872958257714e-05, 'epoch': 4.3} +{'loss': 36.764, 'grad_norm': 224.85240173339844, 'learning_rate': 2.356442831215971e-05, 'epoch': 4.3} +{'loss': 37.7105, 'grad_norm': 218.51856994628906, 'learning_rate': 2.3558983666061705e-05, 'epoch': 4.31} +{'loss': 38.2378, 'grad_norm': 242.14483642578125, 'learning_rate': 2.35535390199637e-05, 'epoch': 4.31} +{'loss': 36.9229, 'grad_norm': 245.50604248046875, 'learning_rate': 2.3548094373865702e-05, 'epoch': 4.31} +{'loss': 37.5557, 'grad_norm': 215.5889892578125, 'learning_rate': 2.3542649727767697e-05, 'epoch': 4.32} +{'loss': 37.6031, 'grad_norm': 203.4392547607422, 'learning_rate': 2.3537205081669693e-05, 'epoch': 4.32} +{'loss': 37.6715, 'grad_norm': 231.23709106445312, 'learning_rate': 2.3531760435571688e-05, 'epoch': 4.33} +{'loss': 37.645, 'grad_norm': 217.31813049316406, 'learning_rate': 2.3526315789473684e-05, 'epoch': 4.33} +{'loss': 39.1993, 'grad_norm': 182.10690307617188, 'learning_rate': 2.352087114337568e-05, 'epoch': 4.33} + 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1200/5520 [1:02:05<3:24:43, 2.84s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1210/5520 [1:02:34<3:26:22, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6532073616981506, 'eval_runtime': 3.1314, 'eval_samples_per_second': 57.163, 'eval_steps_per_second': 57.163, 'epoch': 4.33} +{'loss': 38.1029, 'grad_norm': 232.332763671875, 'learning_rate': 2.3515426497277678e-05, 'epoch': 4.34} +{'loss': 40.2538, 'grad_norm': 251.8763885498047, 'learning_rate': 2.3509981851179673e-05, 'epoch': 4.34} +{'loss': 39.115, 'grad_norm': 260.1363525390625, 'learning_rate': 2.350453720508167e-05, 'epoch': 4.34} +{'loss': 37.7692, 'grad_norm': 227.32473754882812, 'learning_rate': 2.3499092558983667e-05, 'epoch': 4.35} +{'loss': 26.7583, 'grad_norm': 208.3872528076172, 'learning_rate': 2.3493647912885663e-05, 'epoch': 4.35} +{'loss': 24.7576, 'grad_norm': 173.05075073242188, 'learning_rate': 2.348820326678766e-05, 'epoch': 4.35} +{'loss': 24.8792, 'grad_norm': 214.4512939453125, 'learning_rate': 2.3482758620689657e-05, 'epoch': 4.36} +{'loss': 26.1507, 'grad_norm': 179.293701171875, 'learning_rate': 2.3477313974591652e-05, 'epoch': 4.36} +{'loss': 47.4017, 'grad_norm': 401.9908142089844, 'learning_rate': 2.3471869328493648e-05, 'epoch': 4.36} +{'loss': 48.0082, 'grad_norm': 399.3369140625, 'learning_rate': 2.3466424682395643e-05, 'epoch': 4.37} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1220/5520 [1:03:05<3:24:13, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6664602756500244, 'eval_runtime': 3.1305, 'eval_samples_per_second': 57.18, 'eval_steps_per_second': 57.18, 'epoch': 4.37} +{'loss': 47.4843, 'grad_norm': 320.49090576171875, 'learning_rate': 2.346098003629764e-05, 'epoch': 4.37} +{'loss': 46.3087, 'grad_norm': 297.55615234375, 'learning_rate': 2.3455535390199637e-05, 'epoch': 4.38} +{'loss': 45.4889, 'grad_norm': 245.03399658203125, 'learning_rate': 2.3450090744101636e-05, 'epoch': 4.38} +{'loss': 45.8501, 'grad_norm': 227.94091796875, 'learning_rate': 2.344464609800363e-05, 'epoch': 4.38} +{'loss': 46.2737, 'grad_norm': 262.7824401855469, 'learning_rate': 2.3439201451905627e-05, 'epoch': 4.39} +{'loss': 45.2876, 'grad_norm': 235.969970703125, 'learning_rate': 2.3433756805807622e-05, 'epoch': 4.39} +{'loss': 45.4931, 'grad_norm': 244.8028106689453, 'learning_rate': 2.342831215970962e-05, 'epoch': 4.39} +{'loss': 45.6649, 'grad_norm': 236.24844360351562, 'learning_rate': 2.3422867513611616e-05, 'epoch': 4.4} +{'loss': 43.9613, 'grad_norm': 204.7911834716797, 'learning_rate': 2.341742286751361e-05, 'epoch': 4.4} +{'loss': 41.9267, 'grad_norm': 190.6739044189453, 'learning_rate': 2.3411978221415607e-05, 'epoch': 4.4} + 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1230/5520 [1:03:36<3:23:43, 2.85s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6481396555900574, 'eval_runtime': 3.1325, 'eval_samples_per_second': 57.142, 'eval_steps_per_second': 57.142, 'epoch': 4.4} +{'loss': 42.34, 'grad_norm': 224.25758361816406, 'learning_rate': 2.3406533575317602e-05, 'epoch': 4.41} +{'loss': 40.6947, 'grad_norm': 238.21913146972656, 'learning_rate': 2.34010889292196e-05, 'epoch': 4.41} +{'loss': 39.8585, 'grad_norm': 255.64395141601562, 'learning_rate': 2.33956442831216e-05, 'epoch': 4.42} +{'loss': 42.6031, 'grad_norm': 202.08859252929688, 'learning_rate': 2.3390199637023595e-05, 'epoch': 4.42} +{'loss': 41.9946, 'grad_norm': 222.359619140625, 'learning_rate': 2.338475499092559e-05, 'epoch': 4.42} +{'loss': 40.9174, 'grad_norm': 198.84461975097656, 'learning_rate': 2.3379310344827586e-05, 'epoch': 4.43} +{'loss': 42.2865, 'grad_norm': 227.34942626953125, 'learning_rate': 2.337386569872958e-05, 'epoch': 4.43} +{'loss': 42.6508, 'grad_norm': 249.9097900390625, 'learning_rate': 2.336842105263158e-05, 'epoch': 4.43} +{'loss': 43.0846, 'grad_norm': 236.96009826660156, 'learning_rate': 2.3362976406533576e-05, 'epoch': 4.44} +{'loss': 42.4119, 'grad_norm': 183.06201171875, 'learning_rate': 2.335753176043557e-05, 'epoch': 4.44} + 22%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1240/5520 [1:04:07<3:22:06, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6428424715995789, 'eval_runtime': 3.1359, 'eval_samples_per_second': 57.08, 'eval_steps_per_second': 57.08, 'epoch': 4.44} +{'loss': 43.1702, 'grad_norm': 199.0382843017578, 'learning_rate': 2.335208711433757e-05, 'epoch': 4.44} +{'loss': 43.3518, 'grad_norm': 221.87939453125, 'learning_rate': 2.3346642468239565e-05, 'epoch': 4.45} +{'loss': 42.9713, 'grad_norm': 205.0601043701172, 'learning_rate': 2.3341197822141564e-05, 'epoch': 4.45} +{'loss': 42.6973, 'grad_norm': 235.3998565673828, 'learning_rate': 2.333575317604356e-05, 'epoch': 4.46} +{'loss': 43.351, 'grad_norm': 171.76986694335938, 'learning_rate': 2.3330308529945555e-05, 'epoch': 4.46} +{'loss': 43.8662, 'grad_norm': 261.549072265625, 'learning_rate': 2.332486388384755e-05, 'epoch': 4.46} +{'loss': 40.7938, 'grad_norm': 256.76837158203125, 'learning_rate': 2.3319419237749545e-05, 'epoch': 4.47} +{'loss': 38.1021, 'grad_norm': 176.35060119628906, 'learning_rate': 2.331397459165154e-05, 'epoch': 4.47} +{'loss': 36.6359, 'grad_norm': 203.00906372070312, 'learning_rate': 2.330852994555354e-05, 'epoch': 4.47} +{'loss': 34.448, 'grad_norm': 259.6462707519531, 'learning_rate': 2.3303085299455535e-05, 'epoch': 4.48} + 23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1250/5520 [1:04:38<3:19:32, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6386051177978516, 'eval_runtime': 3.1322, 'eval_samples_per_second': 57.148, 'eval_steps_per_second': 57.148, 'epoch': 4.48} +{'loss': 35.2353, 'grad_norm': 215.24737548828125, 'learning_rate': 2.3297640653357534e-05, 'epoch': 4.48} +{'loss': 38.2077, 'grad_norm': 249.12355041503906, 'learning_rate': 2.329219600725953e-05, 'epoch': 4.48} +{'loss': 36.8363, 'grad_norm': 191.0881805419922, 'learning_rate': 2.3286751361161525e-05, 'epoch': 4.49} +{'loss': 36.7398, 'grad_norm': 229.26449584960938, 'learning_rate': 2.3281306715063523e-05, 'epoch': 4.49} +{'loss': 35.6614, 'grad_norm': 184.931884765625, 'learning_rate': 2.327586206896552e-05, 'epoch': 4.49} +{'loss': 36.9818, 'grad_norm': 183.7378387451172, 'learning_rate': 2.3270417422867514e-05, 'epoch': 4.5} +{'loss': 38.1348, 'grad_norm': 191.42543029785156, 'learning_rate': 2.326497277676951e-05, 'epoch': 4.5} +{'loss': 37.0112, 'grad_norm': 211.6359100341797, 'learning_rate': 2.3259528130671505e-05, 'epoch': 4.51} +{'loss': 38.6218, 'grad_norm': 245.6946563720703, 'learning_rate': 2.32540834845735e-05, 'epoch': 4.51} +{'loss': 36.9687, 'grad_norm': 193.29095458984375, 'learning_rate': 2.3248638838475502e-05, 'epoch': 4.51} + 23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 1260/5520 [1:05:09<3:20:19, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6432057023048401, 'eval_runtime': 3.1301, 'eval_samples_per_second': 57.187, 'eval_steps_per_second': 57.187, 'epoch': 4.51} +{'loss': 39.8086, 'grad_norm': 247.0595245361328, 'learning_rate': 2.3243194192377498e-05, 'epoch': 4.52} +{'loss': 38.7245, 'grad_norm': 243.1544189453125, 'learning_rate': 2.3237749546279493e-05, 'epoch': 4.52} +{'loss': 39.5335, 'grad_norm': 322.0834045410156, 'learning_rate': 2.323230490018149e-05, 'epoch': 4.52} +{'loss': 30.2928, 'grad_norm': 201.5956573486328, 'learning_rate': 2.3226860254083484e-05, 'epoch': 4.53} +{'loss': 24.8504, 'grad_norm': 186.13291931152344, 'learning_rate': 2.3221415607985483e-05, 'epoch': 4.53} +{'loss': 24.5528, 'grad_norm': 251.50608825683594, 'learning_rate': 2.3215970961887478e-05, 'epoch': 4.53} +{'loss': 25.0864, 'grad_norm': 180.21124267578125, 'learning_rate': 2.3210526315789473e-05, 'epoch': 4.54} +{'loss': 27.1602, 'grad_norm': 206.5410614013672, 'learning_rate': 2.320508166969147e-05, 'epoch': 4.54} +{'loss': 47.3734, 'grad_norm': 342.1103210449219, 'learning_rate': 2.3199637023593468e-05, 'epoch': 4.55} +{'loss': 48.0316, 'grad_norm': 418.3056945800781, 'learning_rate': 2.3194192377495463e-05, 'epoch': 4.55} + 23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1270/5520 [1:05:40<3:20:30, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6742400527000427, 'eval_runtime': 3.1338, 'eval_samples_per_second': 57.119, 'eval_steps_per_second': 57.119, 'epoch': 4.55} +{'loss': 47.4532, 'grad_norm': 369.8560791015625, 'learning_rate': 2.3188747731397462e-05, 'epoch': 4.55} +{'loss': 47.0661, 'grad_norm': 322.0288391113281, 'learning_rate': 2.3183303085299457e-05, 'epoch': 4.56} +{'loss': 45.1875, 'grad_norm': 244.79066467285156, 'learning_rate': 2.3177858439201453e-05, 'epoch': 4.56} +{'loss': 46.1355, 'grad_norm': 209.29397583007812, 'learning_rate': 2.3172413793103448e-05, 'epoch': 4.56} +{'loss': 45.8947, 'grad_norm': 271.5123291015625, 'learning_rate': 2.3166969147005443e-05, 'epoch': 4.57} +{'loss': 45.6542, 'grad_norm': 232.42913818359375, 'learning_rate': 2.3161524500907442e-05, 'epoch': 4.57} +{'loss': 45.8805, 'grad_norm': 282.50738525390625, 'learning_rate': 2.3156079854809437e-05, 'epoch': 4.57} +{'loss': 44.8926, 'grad_norm': 203.39031982421875, 'learning_rate': 2.3150635208711436e-05, 'epoch': 4.58} +{'loss': 43.7589, 'grad_norm': 213.94894409179688, 'learning_rate': 2.314519056261343e-05, 'epoch': 4.58} +{'loss': 41.819, 'grad_norm': 198.9677734375, 'learning_rate': 2.3139745916515427e-05, 'epoch': 4.59} + 23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1280/5520 [1:06:11<3:19:56, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6428627371788025, 'eval_runtime': 3.1389, 'eval_samples_per_second': 57.027, 'eval_steps_per_second': 57.027, 'epoch': 4.59} +{'loss': 40.6128, 'grad_norm': 197.69903564453125, 'learning_rate': 2.3134301270417422e-05, 'epoch': 4.59} +{'loss': 41.1856, 'grad_norm': 229.10488891601562, 'learning_rate': 2.312885662431942e-05, 'epoch': 4.59} +{'loss': 40.2048, 'grad_norm': 254.4750213623047, 'learning_rate': 2.3123411978221417e-05, 'epoch': 4.6} +{'loss': 41.663, 'grad_norm': 247.2012939453125, 'learning_rate': 2.3117967332123412e-05, 'epoch': 4.6} +{'loss': 41.1102, 'grad_norm': 196.78761291503906, 'learning_rate': 2.3112522686025407e-05, 'epoch': 4.6} +{'loss': 39.6368, 'grad_norm': 179.03880310058594, 'learning_rate': 2.3107078039927403e-05, 'epoch': 4.61} +{'loss': 42.9424, 'grad_norm': 203.49159240722656, 'learning_rate': 2.3101633393829405e-05, 'epoch': 4.61} +{'loss': 42.0636, 'grad_norm': 254.80018615722656, 'learning_rate': 2.30961887477314e-05, 'epoch': 4.61} +{'loss': 41.4738, 'grad_norm': 201.86109924316406, 'learning_rate': 2.3090744101633396e-05, 'epoch': 4.62} +{'loss': 41.8529, 'grad_norm': 185.1239471435547, 'learning_rate': 2.308529945553539e-05, 'epoch': 4.62} + 23%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1290/5520 [1:06:42<3:18:28, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6457561254501343, 'eval_runtime': 3.1341, 'eval_samples_per_second': 57.113, 'eval_steps_per_second': 57.113, 'epoch': 4.62} +{'loss': 41.8397, 'grad_norm': 198.6769561767578, 'learning_rate': 2.3079854809437386e-05, 'epoch': 4.62} +{'loss': 43.5585, 'grad_norm': 254.9165496826172, 'learning_rate': 2.3074410163339382e-05, 'epoch': 4.63} +{'loss': 41.7349, 'grad_norm': 183.61181640625, 'learning_rate': 2.306896551724138e-05, 'epoch': 4.63} +{'loss': 42.6239, 'grad_norm': 206.0381622314453, 'learning_rate': 2.3063520871143376e-05, 'epoch': 4.64} +{'loss': 43.0988, 'grad_norm': 188.5303497314453, 'learning_rate': 2.305807622504537e-05, 'epoch': 4.64} +{'loss': 43.8379, 'grad_norm': 208.30039978027344, 'learning_rate': 2.3052631578947367e-05, 'epoch': 4.64} +{'loss': 41.4395, 'grad_norm': 209.494384765625, 'learning_rate': 2.3047186932849365e-05, 'epoch': 4.65} +{'loss': 38.5792, 'grad_norm': 223.97824096679688, 'learning_rate': 2.3041742286751364e-05, 'epoch': 4.65} +{'loss': 36.2448, 'grad_norm': 209.16192626953125, 'learning_rate': 2.303629764065336e-05, 'epoch': 4.65} +{'loss': 35.1692, 'grad_norm': 260.72821044921875, 'learning_rate': 2.3030852994555355e-05, 'epoch': 4.66} + 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1300/5520 [1:07:13<3:18:15, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6381233334541321, 'eval_runtime': 3.1443, 'eval_samples_per_second': 56.928, 'eval_steps_per_second': 56.928, 'epoch': 4.66} +{'loss': 35.2234, 'grad_norm': 222.2270965576172, 'learning_rate': 2.302540834845735e-05, 'epoch': 4.66} +{'loss': 35.6167, 'grad_norm': 208.68218994140625, 'learning_rate': 2.3019963702359346e-05, 'epoch': 4.66} +{'loss': 36.9489, 'grad_norm': 199.57015991210938, 'learning_rate': 2.301451905626134e-05, 'epoch': 4.67} +{'loss': 37.0681, 'grad_norm': 249.1312255859375, 'learning_rate': 2.300907441016334e-05, 'epoch': 4.67} +{'loss': 38.3897, 'grad_norm': 227.86341857910156, 'learning_rate': 2.3003629764065335e-05, 'epoch': 4.68} +{'loss': 39.1391, 'grad_norm': 290.3368225097656, 'learning_rate': 2.2998185117967334e-05, 'epoch': 4.68} +{'loss': 38.6362, 'grad_norm': 222.59974670410156, 'learning_rate': 2.299274047186933e-05, 'epoch': 4.68} +{'loss': 37.1796, 'grad_norm': 233.853515625, 'learning_rate': 2.2987295825771325e-05, 'epoch': 4.69} +{'loss': 38.5097, 'grad_norm': 202.83212280273438, 'learning_rate': 2.2981851179673324e-05, 'epoch': 4.69} +{'loss': 38.3335, 'grad_norm': 203.59027099609375, 'learning_rate': 2.297640653357532e-05, 'epoch': 4.69} + 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1310/5520 [1:07:44<3:19:07, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6446877717971802, 'eval_runtime': 3.1368, 'eval_samples_per_second': 57.064, 'eval_steps_per_second': 57.064, 'epoch': 4.69} +{'loss': 39.1848, 'grad_norm': 250.48324584960938, 'learning_rate': 2.2970961887477314e-05, 'epoch': 4.7} +{'loss': 38.2276, 'grad_norm': 218.0867462158203, 'learning_rate': 2.296551724137931e-05, 'epoch': 4.7} +{'loss': 38.4487, 'grad_norm': 316.4258728027344, 'learning_rate': 2.2960072595281305e-05, 'epoch': 4.7} +{'loss': 29.1075, 'grad_norm': 262.96832275390625, 'learning_rate': 2.29546279491833e-05, 'epoch': 4.71} +{'loss': 24.6257, 'grad_norm': 261.25897216796875, 'learning_rate': 2.2949183303085303e-05, 'epoch': 4.71} +{'loss': 24.4387, 'grad_norm': 223.29014587402344, 'learning_rate': 2.2943738656987298e-05, 'epoch': 4.72} +{'loss': 25.0916, 'grad_norm': 167.95193481445312, 'learning_rate': 2.2938294010889293e-05, 'epoch': 4.72} +{'loss': 26.1631, 'grad_norm': 203.88392639160156, 'learning_rate': 2.293284936479129e-05, 'epoch': 4.72} +{'loss': 47.7021, 'grad_norm': 350.67657470703125, 'learning_rate': 2.2927404718693284e-05, 'epoch': 4.73} +{'loss': 47.8161, 'grad_norm': 357.1839294433594, 'learning_rate': 2.2921960072595283e-05, 'epoch': 4.73} + 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1320/5520 [1:08:15<3:17:15, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6716815829277039, 'eval_runtime': 3.1361, 'eval_samples_per_second': 57.077, 'eval_steps_per_second': 57.077, 'epoch': 4.73} +{'loss': 47.5608, 'grad_norm': 334.40216064453125, 'learning_rate': 2.291651542649728e-05, 'epoch': 4.73} +{'loss': 45.9858, 'grad_norm': 322.90008544921875, 'learning_rate': 2.2911070780399274e-05, 'epoch': 4.74} +{'loss': 45.9813, 'grad_norm': 291.5083923339844, 'learning_rate': 2.290562613430127e-05, 'epoch': 4.74} +{'loss': 44.4287, 'grad_norm': 234.91102600097656, 'learning_rate': 2.2900181488203268e-05, 'epoch': 4.74} +{'loss': 45.3697, 'grad_norm': 271.03582763671875, 'learning_rate': 2.2894736842105263e-05, 'epoch': 4.75} +{'loss': 45.1817, 'grad_norm': 256.219482421875, 'learning_rate': 2.2889292196007262e-05, 'epoch': 4.75} +{'loss': 45.2029, 'grad_norm': 252.0631561279297, 'learning_rate': 2.2883847549909257e-05, 'epoch': 4.75} +{'loss': 44.9802, 'grad_norm': 249.41812133789062, 'learning_rate': 2.2878402903811253e-05, 'epoch': 4.76} +{'loss': 44.3745, 'grad_norm': 208.9102325439453, 'learning_rate': 2.2872958257713248e-05, 'epoch': 4.76} +{'loss': 40.9193, 'grad_norm': 322.94903564453125, 'learning_rate': 2.2867513611615244e-05, 'epoch': 4.77} + 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1330/5520 [1:08:47<3:18:02, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6515910029411316, 'eval_runtime': 3.1376, 'eval_samples_per_second': 57.049, 'eval_steps_per_second': 57.049, 'epoch': 4.77} +{'loss': 39.7286, 'grad_norm': 264.6942138671875, 'learning_rate': 2.2862068965517242e-05, 'epoch': 4.77} +{'loss': 41.3846, 'grad_norm': 276.6095886230469, 'learning_rate': 2.2856624319419238e-05, 'epoch': 4.77} +{'loss': 40.5583, 'grad_norm': 199.59877014160156, 'learning_rate': 2.2851179673321233e-05, 'epoch': 4.78} +{'loss': 40.9513, 'grad_norm': 252.59158325195312, 'learning_rate': 2.2845735027223232e-05, 'epoch': 4.78} +{'loss': 41.5119, 'grad_norm': 215.53826904296875, 'learning_rate': 2.2840290381125227e-05, 'epoch': 4.78} +{'loss': 42.7646, 'grad_norm': 290.7100524902344, 'learning_rate': 2.2834845735027226e-05, 'epoch': 4.79} +{'loss': 42.2708, 'grad_norm': 190.2306671142578, 'learning_rate': 2.282940108892922e-05, 'epoch': 4.79} +{'loss': 41.9279, 'grad_norm': 187.5550079345703, 'learning_rate': 2.2823956442831217e-05, 'epoch': 4.79} +{'loss': 42.2688, 'grad_norm': 169.10414123535156, 'learning_rate': 2.2818511796733212e-05, 'epoch': 4.8} +{'loss': 41.9192, 'grad_norm': 199.5216064453125, 'learning_rate': 2.2813067150635208e-05, 'epoch': 4.8} + 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1340/5520 [1:09:18<3:17:18, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6402038335800171, 'eval_runtime': 3.1407, 'eval_samples_per_second': 56.994, 'eval_steps_per_second': 56.994, 'epoch': 4.8} +{'loss': 43.8218, 'grad_norm': 222.4996337890625, 'learning_rate': 2.2807622504537203e-05, 'epoch': 4.81} +{'loss': 42.9497, 'grad_norm': 228.1157684326172, 'learning_rate': 2.2802177858439202e-05, 'epoch': 4.81} +{'loss': 43.9723, 'grad_norm': 179.83697509765625, 'learning_rate': 2.27967332123412e-05, 'epoch': 4.81} +{'loss': 43.3302, 'grad_norm': 196.81983947753906, 'learning_rate': 2.2791288566243196e-05, 'epoch': 4.82} +{'loss': 41.8957, 'grad_norm': 186.61160278320312, 'learning_rate': 2.278584392014519e-05, 'epoch': 4.82} +{'loss': 43.1916, 'grad_norm': 242.55886840820312, 'learning_rate': 2.2780399274047187e-05, 'epoch': 4.82} +{'loss': 38.3371, 'grad_norm': 212.07177734375, 'learning_rate': 2.2774954627949185e-05, 'epoch': 4.83} +{'loss': 36.3413, 'grad_norm': 180.1990966796875, 'learning_rate': 2.276950998185118e-05, 'epoch': 4.83} +{'loss': 35.4426, 'grad_norm': 202.69529724121094, 'learning_rate': 2.2764065335753176e-05, 'epoch': 4.83} +{'loss': 35.5281, 'grad_norm': 180.47283935546875, 'learning_rate': 2.275862068965517e-05, 'epoch': 4.84} + 24%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1350/5520 [1:09:48<3:16:07, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6356105804443359, 'eval_runtime': 3.1378, 'eval_samples_per_second': 57.045, 'eval_steps_per_second': 57.045, 'epoch': 4.84} +{'loss': 36.2566, 'grad_norm': 204.674560546875, 'learning_rate': 2.2753176043557167e-05, 'epoch': 4.84} +{'loss': 36.3862, 'grad_norm': 272.1197204589844, 'learning_rate': 2.2747731397459166e-05, 'epoch': 4.85} +{'loss': 35.1455, 'grad_norm': 235.55101013183594, 'learning_rate': 2.2742286751361165e-05, 'epoch': 4.85} +{'loss': 37.3824, 'grad_norm': 271.2718200683594, 'learning_rate': 2.273684210526316e-05, 'epoch': 4.85} +{'loss': 37.6587, 'grad_norm': 242.15728759765625, 'learning_rate': 2.2731397459165155e-05, 'epoch': 4.86} +{'loss': 36.7602, 'grad_norm': 218.59481811523438, 'learning_rate': 2.272595281306715e-05, 'epoch': 4.86} +{'loss': 38.187, 'grad_norm': 231.9490203857422, 'learning_rate': 2.2720508166969146e-05, 'epoch': 4.86} +{'loss': 38.1905, 'grad_norm': 385.56158447265625, 'learning_rate': 2.2715063520871145e-05, 'epoch': 4.87} +{'loss': 38.2179, 'grad_norm': 219.38204956054688, 'learning_rate': 2.270961887477314e-05, 'epoch': 4.87} +{'loss': 37.3696, 'grad_norm': 209.46580505371094, 'learning_rate': 2.2704174228675136e-05, 'epoch': 4.87} + 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1360/5520 [1:10:20<3:15:36, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6412517428398132, 'eval_runtime': 3.1349, 'eval_samples_per_second': 57.099, 'eval_steps_per_second': 57.099, 'epoch': 4.87} +{'loss': 38.5144, 'grad_norm': 205.53416442871094, 'learning_rate': 2.2698729582577134e-05, 'epoch': 4.88} +{'loss': 38.7372, 'grad_norm': 214.2522735595703, 'learning_rate': 2.269328493647913e-05, 'epoch': 4.88} +{'loss': 38.8987, 'grad_norm': 236.9787139892578, 'learning_rate': 2.2687840290381125e-05, 'epoch': 4.88} +{'loss': 35.0837, 'grad_norm': 247.30906677246094, 'learning_rate': 2.2682395644283124e-05, 'epoch': 4.89} +{'loss': 25.5272, 'grad_norm': 287.5954284667969, 'learning_rate': 2.267695099818512e-05, 'epoch': 4.89} +{'loss': 25.1288, 'grad_norm': 254.61672973632812, 'learning_rate': 2.2671506352087115e-05, 'epoch': 4.9} +{'loss': 25.0588, 'grad_norm': 180.98666381835938, 'learning_rate': 2.266606170598911e-05, 'epoch': 4.9} +{'loss': 25.464, 'grad_norm': 213.0275421142578, 'learning_rate': 2.2660617059891105e-05, 'epoch': 4.9} +{'loss': 47.0056, 'grad_norm': 385.18035888671875, 'learning_rate': 2.2655172413793104e-05, 'epoch': 4.91} +{'loss': 46.9892, 'grad_norm': 383.4106140136719, 'learning_rate': 2.2649727767695103e-05, 'epoch': 4.91} + 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1370/5520 [1:10:51<3:16:21, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6618479490280151, 'eval_runtime': 3.1376, 'eval_samples_per_second': 57.049, 'eval_steps_per_second': 57.049, 'epoch': 4.91} +{'loss': 47.1619, 'grad_norm': 415.4345397949219, 'learning_rate': 2.26442831215971e-05, 'epoch': 4.91} +{'loss': 46.7232, 'grad_norm': 362.338134765625, 'learning_rate': 2.2638838475499094e-05, 'epoch': 4.92} +{'loss': 46.4438, 'grad_norm': 378.7535400390625, 'learning_rate': 2.263339382940109e-05, 'epoch': 4.92} +{'loss': 44.8178, 'grad_norm': 251.64901733398438, 'learning_rate': 2.2627949183303085e-05, 'epoch': 4.92} +{'loss': 43.0865, 'grad_norm': 273.1052551269531, 'learning_rate': 2.2622504537205083e-05, 'epoch': 4.93} +{'loss': 42.2463, 'grad_norm': 229.66415405273438, 'learning_rate': 2.261705989110708e-05, 'epoch': 4.93} +{'loss': 42.4395, 'grad_norm': 229.47940063476562, 'learning_rate': 2.2611615245009074e-05, 'epoch': 4.94} +{'loss': 42.4994, 'grad_norm': 224.48890686035156, 'learning_rate': 2.260617059891107e-05, 'epoch': 4.94} +{'loss': 42.5535, 'grad_norm': 241.98745727539062, 'learning_rate': 2.2600725952813065e-05, 'epoch': 4.94} +{'loss': 42.8475, 'grad_norm': 258.1711120605469, 'learning_rate': 2.2595281306715067e-05, 'epoch': 4.95} + 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1380/5520 [1:11:22<3:15:18, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.639252245426178, 'eval_runtime': 3.1354, 'eval_samples_per_second': 57.09, 'eval_steps_per_second': 57.09, 'epoch': 4.95} +{'loss': 42.9895, 'grad_norm': 204.64927673339844, 'learning_rate': 2.2589836660617062e-05, 'epoch': 4.95} +{'loss': 43.1972, 'grad_norm': 342.9057922363281, 'learning_rate': 2.2584392014519058e-05, 'epoch': 4.95} +{'loss': 42.406, 'grad_norm': 207.45504760742188, 'learning_rate': 2.2578947368421053e-05, 'epoch': 4.96} +{'loss': 36.8817, 'grad_norm': 232.78831481933594, 'learning_rate': 2.257350272232305e-05, 'epoch': 4.96} +{'loss': 34.584, 'grad_norm': 249.3349609375, 'learning_rate': 2.2568058076225044e-05, 'epoch': 4.96} +{'loss': 36.9512, 'grad_norm': 322.7100524902344, 'learning_rate': 2.2562613430127043e-05, 'epoch': 4.97} +{'loss': 37.6833, 'grad_norm': 357.65228271484375, 'learning_rate': 2.2557168784029038e-05, 'epoch': 4.97} +{'loss': 38.597, 'grad_norm': 300.0970153808594, 'learning_rate': 2.2551724137931033e-05, 'epoch': 4.98} +{'loss': 38.4155, 'grad_norm': 234.52508544921875, 'learning_rate': 2.2546279491833032e-05, 'epoch': 4.98} +{'loss': 38.1589, 'grad_norm': 270.60626220703125, 'learning_rate': 2.2540834845735028e-05, 'epoch': 4.98} + 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1390/5520 [1:11:52<3:12:59, 2.80s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6409950256347656, 'eval_runtime': 3.1363, 'eval_samples_per_second': 57.073, 'eval_steps_per_second': 57.073, 'epoch': 4.98} +{'loss': 39.281, 'grad_norm': 232.9596710205078, 'learning_rate': 2.2535390199637026e-05, 'epoch': 4.99} +{'loss': 40.0868, 'grad_norm': 248.0550994873047, 'learning_rate': 2.2529945553539022e-05, 'epoch': 4.99} +{'loss': 28.1259, 'grad_norm': 256.327880859375, 'learning_rate': 2.2524500907441017e-05, 'epoch': 4.99} +{'loss': 25.3166, 'grad_norm': 198.29559326171875, 'learning_rate': 2.2519056261343012e-05, 'epoch': 5.0} +{'loss': 22.0749, 'grad_norm': 174.66856384277344, 'learning_rate': 2.2513611615245008e-05, 'epoch': 5.0} +{'loss': 45.2433, 'grad_norm': 309.0927429199219, 'learning_rate': 2.2508166969147003e-05, 'epoch': 5.0} +{'loss': 46.7025, 'grad_norm': 293.1455383300781, 'learning_rate': 2.2502722323049002e-05, 'epoch': 5.01} +{'loss': 45.3218, 'grad_norm': 269.47662353515625, 'learning_rate': 2.2497277676951e-05, 'epoch': 5.01} +{'loss': 44.9849, 'grad_norm': 284.49560546875, 'learning_rate': 2.2491833030852996e-05, 'epoch': 5.01} +{'loss': 44.887, 'grad_norm': 223.5511474609375, 'learning_rate': 2.248638838475499e-05, 'epoch': 5.02} + 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1400/5520 [1:12:23<3:14:20, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6435533165931702, 'eval_runtime': 3.139, 'eval_samples_per_second': 57.024, 'eval_steps_per_second': 57.024, 'epoch': 5.02} +{'loss': 45.1483, 'grad_norm': 243.4492645263672, 'learning_rate': 2.2480943738656987e-05, 'epoch': 5.02} +{'loss': 44.3713, 'grad_norm': 265.1712646484375, 'learning_rate': 2.2475499092558986e-05, 'epoch': 5.03} +{'loss': 45.3138, 'grad_norm': 190.72190856933594, 'learning_rate': 2.247005444646098e-05, 'epoch': 5.03} +{'loss': 43.302, 'grad_norm': 177.26686096191406, 'learning_rate': 2.2464609800362976e-05, 'epoch': 5.03} +{'loss': 43.6363, 'grad_norm': 198.6124725341797, 'learning_rate': 2.2459165154264972e-05, 'epoch': 5.04} +{'loss': 43.0345, 'grad_norm': 233.78738403320312, 'learning_rate': 2.2453720508166967e-05, 'epoch': 5.04} +{'loss': 41.5932, 'grad_norm': 225.48614501953125, 'learning_rate': 2.2448275862068966e-05, 'epoch': 5.04} +{'loss': 40.1401, 'grad_norm': 204.31179809570312, 'learning_rate': 2.2442831215970965e-05, 'epoch': 5.05} +{'loss': 40.8834, 'grad_norm': 219.5385284423828, 'learning_rate': 2.243738656987296e-05, 'epoch': 5.05} +{'loss': 40.4476, 'grad_norm': 168.3094024658203, 'learning_rate': 2.2431941923774956e-05, 'epoch': 5.05} + 25%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1400/5520 [1:12:26<3:14:20, 2.83s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1410/5520 [1:12:55<3:13:35, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6361114382743835, 'eval_runtime': 3.138, 'eval_samples_per_second': 57.043, 'eval_steps_per_second': 57.043, 'epoch': 5.05} +{'loss': 40.1949, 'grad_norm': 169.45201110839844, 'learning_rate': 2.242649727767695e-05, 'epoch': 5.06} +{'loss': 41.0091, 'grad_norm': 208.84634399414062, 'learning_rate': 2.2421052631578946e-05, 'epoch': 5.06} +{'loss': 40.2435, 'grad_norm': 248.86221313476562, 'learning_rate': 2.2415607985480945e-05, 'epoch': 5.07} +{'loss': 42.37, 'grad_norm': 297.0834655761719, 'learning_rate': 2.241016333938294e-05, 'epoch': 5.07} +{'loss': 42.3822, 'grad_norm': 242.12661743164062, 'learning_rate': 2.2404718693284936e-05, 'epoch': 5.07} +{'loss': 41.3722, 'grad_norm': 230.1178741455078, 'learning_rate': 2.2399274047186935e-05, 'epoch': 5.08} +{'loss': 41.8087, 'grad_norm': 191.32371520996094, 'learning_rate': 2.239382940108893e-05, 'epoch': 5.08} +{'loss': 42.5938, 'grad_norm': 267.28753662109375, 'learning_rate': 2.2388384754990925e-05, 'epoch': 5.08} +{'loss': 42.8553, 'grad_norm': 186.61978149414062, 'learning_rate': 2.2382940108892924e-05, 'epoch': 5.09} +{'loss': 41.9677, 'grad_norm': 242.53433227539062, 'learning_rate': 2.237749546279492e-05, 'epoch': 5.09} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1420/5520 [1:13:26<3:13:33, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6330043077468872, 'eval_runtime': 3.1334, 'eval_samples_per_second': 57.127, 'eval_steps_per_second': 57.127, 'epoch': 5.09} +{'loss': 42.9821, 'grad_norm': 199.74696350097656, 'learning_rate': 2.2372050816696915e-05, 'epoch': 5.09} +{'loss': 42.7956, 'grad_norm': 254.1063690185547, 'learning_rate': 2.236660617059891e-05, 'epoch': 5.1} +{'loss': 43.6312, 'grad_norm': 215.59056091308594, 'learning_rate': 2.2361161524500906e-05, 'epoch': 5.1} +{'loss': 40.9468, 'grad_norm': 218.69973754882812, 'learning_rate': 2.2355716878402904e-05, 'epoch': 5.1} +{'loss': 38.2656, 'grad_norm': 200.34927368164062, 'learning_rate': 2.23502722323049e-05, 'epoch': 5.11} +{'loss': 35.8111, 'grad_norm': 191.56883239746094, 'learning_rate': 2.23448275862069e-05, 'epoch': 5.11} +{'loss': 35.1287, 'grad_norm': 192.629150390625, 'learning_rate': 2.2339382940108894e-05, 'epoch': 5.12} +{'loss': 34.9664, 'grad_norm': 217.54855346679688, 'learning_rate': 2.233393829401089e-05, 'epoch': 5.12} +{'loss': 35.9252, 'grad_norm': 234.12355041503906, 'learning_rate': 2.2328493647912888e-05, 'epoch': 5.12} +{'loss': 36.4664, 'grad_norm': 201.83477783203125, 'learning_rate': 2.2323049001814884e-05, 'epoch': 5.13} + 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1430/5520 [1:13:57<3:12:12, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6359394192695618, 'eval_runtime': 3.1334, 'eval_samples_per_second': 57.126, 'eval_steps_per_second': 57.126, 'epoch': 5.13} +{'loss': 35.2733, 'grad_norm': 212.38943481445312, 'learning_rate': 2.231760435571688e-05, 'epoch': 5.13} +{'loss': 37.2009, 'grad_norm': 219.8803253173828, 'learning_rate': 2.2312159709618874e-05, 'epoch': 5.13} +{'loss': 36.9338, 'grad_norm': 222.28221130371094, 'learning_rate': 2.230671506352087e-05, 'epoch': 5.14} +{'loss': 38.0419, 'grad_norm': 217.56607055664062, 'learning_rate': 2.2301270417422865e-05, 'epoch': 5.14} +{'loss': 38.1393, 'grad_norm': 232.7363739013672, 'learning_rate': 2.2295825771324867e-05, 'epoch': 5.14} +{'loss': 37.4169, 'grad_norm': 228.12091064453125, 'learning_rate': 2.2290381125226863e-05, 'epoch': 5.15} +{'loss': 37.6386, 'grad_norm': 247.9901580810547, 'learning_rate': 2.2284936479128858e-05, 'epoch': 5.15} +{'loss': 38.7843, 'grad_norm': 227.96649169921875, 'learning_rate': 2.2279491833030853e-05, 'epoch': 5.16} +{'loss': 37.7056, 'grad_norm': 197.85072326660156, 'learning_rate': 2.227404718693285e-05, 'epoch': 5.16} +{'loss': 38.5554, 'grad_norm': 270.6370544433594, 'learning_rate': 2.2268602540834848e-05, 'epoch': 5.16} + 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1440/5520 [1:14:28<3:12:16, 2.83s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6463288068771362, 'eval_runtime': 3.1382, 'eval_samples_per_second': 57.039, 'eval_steps_per_second': 57.039, 'epoch': 5.16} +{'loss': 32.6593, 'grad_norm': 251.65847778320312, 'learning_rate': 2.2263157894736843e-05, 'epoch': 5.17} +{'loss': 24.8031, 'grad_norm': 248.84368896484375, 'learning_rate': 2.225771324863884e-05, 'epoch': 5.17} +{'loss': 23.8542, 'grad_norm': 218.12979125976562, 'learning_rate': 2.2252268602540834e-05, 'epoch': 5.17} +{'loss': 25.1994, 'grad_norm': 171.4182586669922, 'learning_rate': 2.2246823956442832e-05, 'epoch': 5.18} +{'loss': 25.1259, 'grad_norm': 200.76271057128906, 'learning_rate': 2.2241379310344828e-05, 'epoch': 5.18} +{'loss': 46.7466, 'grad_norm': 324.8979797363281, 'learning_rate': 2.2235934664246827e-05, 'epoch': 5.18} +{'loss': 47.366, 'grad_norm': 391.9200439453125, 'learning_rate': 2.2230490018148822e-05, 'epoch': 5.19} +{'loss': 47.5236, 'grad_norm': 332.51080322265625, 'learning_rate': 2.2225045372050817e-05, 'epoch': 5.19} +{'loss': 44.9235, 'grad_norm': 295.85333251953125, 'learning_rate': 2.2219600725952813e-05, 'epoch': 5.2} +{'loss': 44.5892, 'grad_norm': 246.46482849121094, 'learning_rate': 2.2214156079854808e-05, 'epoch': 5.2} + 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1450/5520 [1:14:59<3:11:33, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6501885056495667, 'eval_runtime': 3.1351, 'eval_samples_per_second': 57.096, 'eval_steps_per_second': 57.096, 'epoch': 5.2} +{'loss': 45.1496, 'grad_norm': 224.99964904785156, 'learning_rate': 2.2208711433756807e-05, 'epoch': 5.2} +{'loss': 44.2362, 'grad_norm': 201.5928497314453, 'learning_rate': 2.2203266787658802e-05, 'epoch': 5.21} +{'loss': 45.7963, 'grad_norm': 220.72509765625, 'learning_rate': 2.21978221415608e-05, 'epoch': 5.21} +{'loss': 44.1812, 'grad_norm': 229.04412841796875, 'learning_rate': 2.2192377495462796e-05, 'epoch': 5.21} +{'loss': 44.364, 'grad_norm': 214.86207580566406, 'learning_rate': 2.2186932849364792e-05, 'epoch': 5.22} +{'loss': 44.1106, 'grad_norm': 169.3239288330078, 'learning_rate': 2.2181488203266787e-05, 'epoch': 5.22} +{'loss': 41.8791, 'grad_norm': 180.3131561279297, 'learning_rate': 2.2176043557168786e-05, 'epoch': 5.22} +{'loss': 39.7917, 'grad_norm': 227.83078002929688, 'learning_rate': 2.217059891107078e-05, 'epoch': 5.23} +{'loss': 41.2864, 'grad_norm': 267.4294738769531, 'learning_rate': 2.2165154264972777e-05, 'epoch': 5.23} +{'loss': 40.7219, 'grad_norm': 210.79034423828125, 'learning_rate': 2.2159709618874772e-05, 'epoch': 5.23} + 26%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1460/5520 [1:15:30<3:11:56, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6369529366493225, 'eval_runtime': 3.1387, 'eval_samples_per_second': 57.03, 'eval_steps_per_second': 57.03, 'epoch': 5.23} +{'loss': 41.0364, 'grad_norm': 205.2632598876953, 'learning_rate': 2.2154264972776768e-05, 'epoch': 5.24} +{'loss': 40.2733, 'grad_norm': 199.7196807861328, 'learning_rate': 2.214882032667877e-05, 'epoch': 5.24} +{'loss': 40.3418, 'grad_norm': 184.26495361328125, 'learning_rate': 2.2143375680580765e-05, 'epoch': 5.25} +{'loss': 40.5658, 'grad_norm': 170.1937713623047, 'learning_rate': 2.213793103448276e-05, 'epoch': 5.25} +{'loss': 41.9252, 'grad_norm': 167.71109008789062, 'learning_rate': 2.2132486388384756e-05, 'epoch': 5.25} +{'loss': 40.0485, 'grad_norm': 184.73162841796875, 'learning_rate': 2.212704174228675e-05, 'epoch': 5.26} +{'loss': 41.6424, 'grad_norm': 195.0812225341797, 'learning_rate': 2.2121597096188747e-05, 'epoch': 5.26} +{'loss': 40.6179, 'grad_norm': 218.23553466796875, 'learning_rate': 2.2116152450090745e-05, 'epoch': 5.26} +{'loss': 42.8747, 'grad_norm': 229.79299926757812, 'learning_rate': 2.211070780399274e-05, 'epoch': 5.27} +{'loss': 42.7016, 'grad_norm': 231.70692443847656, 'learning_rate': 2.2105263157894736e-05, 'epoch': 5.27} + 27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1470/5520 [1:16:01<3:10:36, 2.82s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6424433588981628, 'eval_runtime': 3.1433, 'eval_samples_per_second': 56.947, 'eval_steps_per_second': 56.947, 'epoch': 5.27} +{'loss': 41.206, 'grad_norm': 204.9513397216797, 'learning_rate': 2.209981851179673e-05, 'epoch': 5.27} +{'loss': 44.0126, 'grad_norm': 220.89083862304688, 'learning_rate': 2.209437386569873e-05, 'epoch': 5.28} +{'loss': 41.4934, 'grad_norm': 266.7763671875, 'learning_rate': 2.208892921960073e-05, 'epoch': 5.28} +{'loss': 43.3433, 'grad_norm': 241.42636108398438, 'learning_rate': 2.2083484573502724e-05, 'epoch': 5.29} +{'loss': 35.9569, 'grad_norm': 221.7669219970703, 'learning_rate': 2.207803992740472e-05, 'epoch': 5.29} +{'loss': 36.0824, 'grad_norm': 236.0152130126953, 'learning_rate': 2.2072595281306715e-05, 'epoch': 5.29} +{'loss': 33.6127, 'grad_norm': 239.56224060058594, 'learning_rate': 2.206715063520871e-05, 'epoch': 5.3} +{'loss': 36.11, 'grad_norm': 277.1287841796875, 'learning_rate': 2.2061705989110706e-05, 'epoch': 5.3} +{'loss': 36.9984, 'grad_norm': 250.19515991210938, 'learning_rate': 2.2056261343012705e-05, 'epoch': 5.3} +{'loss': 36.5917, 'grad_norm': 214.2754669189453, 'learning_rate': 2.20508166969147e-05, 'epoch': 5.31} + 27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1480/5520 [1:16:34<3:36:25, 3.21s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6356943845748901, 'eval_runtime': 3.1365, 'eval_samples_per_second': 57.07, 'eval_steps_per_second': 57.07, 'epoch': 5.31} +{'loss': 36.5302, 'grad_norm': 224.37388610839844, 'learning_rate': 2.20453720508167e-05, 'epoch': 5.31} +{'loss': 36.7978, 'grad_norm': 276.2541809082031, 'learning_rate': 2.2039927404718694e-05, 'epoch': 5.31} +{'loss': 37.4063, 'grad_norm': 361.717041015625, 'learning_rate': 2.203448275862069e-05, 'epoch': 5.32} +{'loss': 37.2472, 'grad_norm': 285.3569641113281, 'learning_rate': 2.202903811252269e-05, 'epoch': 5.32} +{'loss': 37.7361, 'grad_norm': 268.160400390625, 'learning_rate': 2.2023593466424684e-05, 'epoch': 5.33} +{'loss': 37.7794, 'grad_norm': 211.38070678710938, 'learning_rate': 2.201814882032668e-05, 'epoch': 5.33} +{'loss': 39.0787, 'grad_norm': 214.10638427734375, 'learning_rate': 2.2012704174228675e-05, 'epoch': 5.33} +{'loss': 37.6853, 'grad_norm': 238.9603271484375, 'learning_rate': 2.200725952813067e-05, 'epoch': 5.34} +{'loss': 38.2844, 'grad_norm': 323.44976806640625, 'learning_rate': 2.2001814882032665e-05, 'epoch': 5.34} +{'loss': 38.8953, 'grad_norm': 289.6131896972656, 'learning_rate': 2.1996370235934668e-05, 'epoch': 5.34} + 27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1490/5520 [1:17:06<3:13:53, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6462770700454712, 'eval_runtime': 3.1673, 'eval_samples_per_second': 56.516, 'eval_steps_per_second': 56.516, 'epoch': 5.34} +{'loss': 28.126, 'grad_norm': 197.47299194335938, 'learning_rate': 2.1990925589836663e-05, 'epoch': 5.35} +{'loss': 24.2205, 'grad_norm': 198.37156677246094, 'learning_rate': 2.1985480943738658e-05, 'epoch': 5.35} +{'loss': 24.119, 'grad_norm': 211.03501892089844, 'learning_rate': 2.1980036297640654e-05, 'epoch': 5.35} +{'loss': 24.7386, 'grad_norm': 182.23316955566406, 'learning_rate': 2.197459165154265e-05, 'epoch': 5.36} +{'loss': 26.0739, 'grad_norm': 192.6392822265625, 'learning_rate': 2.1969147005444648e-05, 'epoch': 5.36} +{'loss': 46.6945, 'grad_norm': 380.62896728515625, 'learning_rate': 2.1963702359346643e-05, 'epoch': 5.36} +{'loss': 46.1797, 'grad_norm': 342.5572814941406, 'learning_rate': 2.195825771324864e-05, 'epoch': 5.37} +{'loss': 45.6588, 'grad_norm': 311.7198791503906, 'learning_rate': 2.1952813067150634e-05, 'epoch': 5.37} +{'loss': 45.2405, 'grad_norm': 260.9885559082031, 'learning_rate': 2.1947368421052633e-05, 'epoch': 5.38} +{'loss': 44.117, 'grad_norm': 263.3132019042969, 'learning_rate': 2.1941923774954628e-05, 'epoch': 5.38} + 27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1500/5520 [1:17:37<3:12:12, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.644275426864624, 'eval_runtime': 3.1366, 'eval_samples_per_second': 57.068, 'eval_steps_per_second': 57.068, 'epoch': 5.38} +{'loss': 45.4002, 'grad_norm': 254.92022705078125, 'learning_rate': 2.1936479128856627e-05, 'epoch': 5.38} +{'loss': 45.3481, 'grad_norm': 246.1839599609375, 'learning_rate': 2.1931034482758622e-05, 'epoch': 5.39} +{'loss': 45.3958, 'grad_norm': 282.2879638671875, 'learning_rate': 2.1925589836660618e-05, 'epoch': 5.39} +{'loss': 44.2959, 'grad_norm': 266.9140930175781, 'learning_rate': 2.1920145190562613e-05, 'epoch': 5.39} +{'loss': 44.765, 'grad_norm': 196.81199645996094, 'learning_rate': 2.191470054446461e-05, 'epoch': 5.4} +{'loss': 42.8581, 'grad_norm': 270.7329406738281, 'learning_rate': 2.1909255898366607e-05, 'epoch': 5.4} +{'loss': 40.7167, 'grad_norm': 187.3281707763672, 'learning_rate': 2.1903811252268603e-05, 'epoch': 5.4} +{'loss': 41.0712, 'grad_norm': 302.9165954589844, 'learning_rate': 2.1898366606170598e-05, 'epoch': 5.41} +{'loss': 40.4098, 'grad_norm': 395.1492614746094, 'learning_rate': 2.1892921960072597e-05, 'epoch': 5.41} +{'loss': 41.2985, 'grad_norm': 253.91494750976562, 'learning_rate': 2.1887477313974592e-05, 'epoch': 5.42} + 27%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1510/5520 [1:18:09<3:12:00, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6383773684501648, 'eval_runtime': 3.1389, 'eval_samples_per_second': 57.027, 'eval_steps_per_second': 57.027, 'epoch': 5.42} +{'loss': 41.179, 'grad_norm': 248.4109344482422, 'learning_rate': 2.1882032667876588e-05, 'epoch': 5.42} +{'loss': 41.1934, 'grad_norm': 210.50015258789062, 'learning_rate': 2.1876588021778586e-05, 'epoch': 5.42} +{'loss': 41.5535, 'grad_norm': 170.64334106445312, 'learning_rate': 2.187114337568058e-05, 'epoch': 5.43} +{'loss': 41.8323, 'grad_norm': 249.41270446777344, 'learning_rate': 2.1865698729582577e-05, 'epoch': 5.43} +{'loss': 42.1517, 'grad_norm': 214.53770446777344, 'learning_rate': 2.1860254083484572e-05, 'epoch': 5.43} +{'loss': 42.7675, 'grad_norm': 225.6502227783203, 'learning_rate': 2.1854809437386568e-05, 'epoch': 5.44} +{'loss': 42.5094, 'grad_norm': 210.19219970703125, 'learning_rate': 2.1849364791288567e-05, 'epoch': 5.44} +{'loss': 42.2218, 'grad_norm': 187.03294372558594, 'learning_rate': 2.1843920145190565e-05, 'epoch': 5.44} +{'loss': 42.7061, 'grad_norm': 227.6764373779297, 'learning_rate': 2.183847549909256e-05, 'epoch': 5.45} +{'loss': 43.1959, 'grad_norm': 239.2847442626953, 'learning_rate': 2.1833030852994556e-05, 'epoch': 5.45} + 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1520/5520 [1:18:40<3:11:44, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6405091285705566, 'eval_runtime': 3.1451, 'eval_samples_per_second': 56.914, 'eval_steps_per_second': 56.914, 'epoch': 5.45} +{'loss': 42.4915, 'grad_norm': 268.887451171875, 'learning_rate': 2.182758620689655e-05, 'epoch': 5.46} +{'loss': 42.1777, 'grad_norm': 261.0531311035156, 'learning_rate': 2.182214156079855e-05, 'epoch': 5.46} +{'loss': 40.8728, 'grad_norm': 241.58819580078125, 'learning_rate': 2.1816696914700546e-05, 'epoch': 5.46} +{'loss': 39.8861, 'grad_norm': 227.302001953125, 'learning_rate': 2.181125226860254e-05, 'epoch': 5.47} +{'loss': 36.8716, 'grad_norm': 293.8402404785156, 'learning_rate': 2.1805807622504536e-05, 'epoch': 5.47} +{'loss': 35.6049, 'grad_norm': 332.8829650878906, 'learning_rate': 2.1800362976406532e-05, 'epoch': 5.47} +{'loss': 34.6785, 'grad_norm': 271.6636962890625, 'learning_rate': 2.179491833030853e-05, 'epoch': 5.48} +{'loss': 35.5321, 'grad_norm': 211.5673065185547, 'learning_rate': 2.178947368421053e-05, 'epoch': 5.48} +{'loss': 35.1604, 'grad_norm': 168.95346069335938, 'learning_rate': 2.1784029038112525e-05, 'epoch': 5.48} +{'loss': 37.8709, 'grad_norm': 242.66725158691406, 'learning_rate': 2.177858439201452e-05, 'epoch': 5.49} + 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1530/5520 [1:19:12<3:10:45, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6324127912521362, 'eval_runtime': 3.1417, 'eval_samples_per_second': 56.975, 'eval_steps_per_second': 56.975, 'epoch': 5.49} +{'loss': 38.1727, 'grad_norm': 202.7799530029297, 'learning_rate': 2.1773139745916516e-05, 'epoch': 5.49} +{'loss': 36.4171, 'grad_norm': 210.12704467773438, 'learning_rate': 2.176769509981851e-05, 'epoch': 5.49} +{'loss': 37.7873, 'grad_norm': 214.7133331298828, 'learning_rate': 2.176225045372051e-05, 'epoch': 5.5} +{'loss': 37.1096, 'grad_norm': 197.89781188964844, 'learning_rate': 2.1756805807622505e-05, 'epoch': 5.5} +{'loss': 36.9907, 'grad_norm': 203.01992797851562, 'learning_rate': 2.17513611615245e-05, 'epoch': 5.51} +{'loss': 38.0291, 'grad_norm': 210.42164611816406, 'learning_rate': 2.17459165154265e-05, 'epoch': 5.51} +{'loss': 37.5385, 'grad_norm': 210.2798309326172, 'learning_rate': 2.1740471869328495e-05, 'epoch': 5.51} +{'loss': 39.2736, 'grad_norm': 217.986572265625, 'learning_rate': 2.173502722323049e-05, 'epoch': 5.52} +{'loss': 39.2733, 'grad_norm': 221.05831909179688, 'learning_rate': 2.172958257713249e-05, 'epoch': 5.52} +{'loss': 37.8987, 'grad_norm': 250.36065673828125, 'learning_rate': 2.1724137931034484e-05, 'epoch': 5.52} + 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1540/5520 [1:19:43<3:09:35, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6414559483528137, 'eval_runtime': 3.1397, 'eval_samples_per_second': 57.011, 'eval_steps_per_second': 57.011, 'epoch': 5.52} +{'loss': 29.4874, 'grad_norm': 275.062255859375, 'learning_rate': 2.171869328493648e-05, 'epoch': 5.53} +{'loss': 25.2165, 'grad_norm': 178.79615783691406, 'learning_rate': 2.1713248638838475e-05, 'epoch': 5.53} +{'loss': 24.7139, 'grad_norm': 221.6693572998047, 'learning_rate': 2.170780399274047e-05, 'epoch': 5.53} +{'loss': 25.2773, 'grad_norm': 207.15869140625, 'learning_rate': 2.170235934664247e-05, 'epoch': 5.54} +{'loss': 25.7936, 'grad_norm': 193.37644958496094, 'learning_rate': 2.1696914700544468e-05, 'epoch': 5.54} +{'loss': 45.8573, 'grad_norm': 314.101318359375, 'learning_rate': 2.1691470054446463e-05, 'epoch': 5.55} +{'loss': 47.1284, 'grad_norm': 376.9578552246094, 'learning_rate': 2.168602540834846e-05, 'epoch': 5.55} +{'loss': 45.1873, 'grad_norm': 343.3904724121094, 'learning_rate': 2.1680580762250454e-05, 'epoch': 5.55} +{'loss': 45.4906, 'grad_norm': 263.31768798828125, 'learning_rate': 2.167513611615245e-05, 'epoch': 5.56} +{'loss': 44.9259, 'grad_norm': 295.50384521484375, 'learning_rate': 2.1669691470054448e-05, 'epoch': 5.56} + 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1550/5520 [1:20:15<3:09:42, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6483813524246216, 'eval_runtime': 3.1446, 'eval_samples_per_second': 56.923, 'eval_steps_per_second': 56.923, 'epoch': 5.56} +{'loss': 43.7965, 'grad_norm': 208.8861846923828, 'learning_rate': 2.1664246823956444e-05, 'epoch': 5.56} +{'loss': 44.7409, 'grad_norm': 195.8695526123047, 'learning_rate': 2.165880217785844e-05, 'epoch': 5.57} +{'loss': 45.9364, 'grad_norm': 218.10089111328125, 'learning_rate': 2.1653357531760434e-05, 'epoch': 5.57} +{'loss': 45.468, 'grad_norm': 204.17205810546875, 'learning_rate': 2.164791288566243e-05, 'epoch': 5.57} +{'loss': 44.7685, 'grad_norm': 239.03952026367188, 'learning_rate': 2.1642468239564432e-05, 'epoch': 5.58} +{'loss': 43.011, 'grad_norm': 251.59300231933594, 'learning_rate': 2.1637023593466427e-05, 'epoch': 5.58} +{'loss': 41.5255, 'grad_norm': 186.72540283203125, 'learning_rate': 2.1631578947368423e-05, 'epoch': 5.59} +{'loss': 40.2522, 'grad_norm': 199.89732360839844, 'learning_rate': 2.1626134301270418e-05, 'epoch': 5.59} +{'loss': 41.0931, 'grad_norm': 182.16624450683594, 'learning_rate': 2.1620689655172413e-05, 'epoch': 5.59} +{'loss': 40.2717, 'grad_norm': 221.58680725097656, 'learning_rate': 2.161524500907441e-05, 'epoch': 5.6} + 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1560/5520 [1:20:46<3:08:29, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6393340229988098, 'eval_runtime': 3.141, 'eval_samples_per_second': 56.988, 'eval_steps_per_second': 56.988, 'epoch': 5.6} +{'loss': 41.7522, 'grad_norm': 209.82183837890625, 'learning_rate': 2.1609800362976408e-05, 'epoch': 5.6} +{'loss': 40.8078, 'grad_norm': 226.1896209716797, 'learning_rate': 2.1604355716878403e-05, 'epoch': 5.6} +{'loss': 42.2331, 'grad_norm': 219.57899475097656, 'learning_rate': 2.1598911070780398e-05, 'epoch': 5.61} +{'loss': 42.0695, 'grad_norm': 185.2303009033203, 'learning_rate': 2.1593466424682397e-05, 'epoch': 5.61} +{'loss': 42.1317, 'grad_norm': 192.32913208007812, 'learning_rate': 2.1588021778584392e-05, 'epoch': 5.61} +{'loss': 40.4957, 'grad_norm': 183.3128662109375, 'learning_rate': 2.158257713248639e-05, 'epoch': 5.62} +{'loss': 40.9154, 'grad_norm': 178.10691833496094, 'learning_rate': 2.1577132486388387e-05, 'epoch': 5.62} +{'loss': 42.8389, 'grad_norm': 207.3495330810547, 'learning_rate': 2.1571687840290382e-05, 'epoch': 5.62} +{'loss': 41.9483, 'grad_norm': 191.46353149414062, 'learning_rate': 2.1566243194192377e-05, 'epoch': 5.63} +{'loss': 41.2037, 'grad_norm': 218.9544219970703, 'learning_rate': 2.1560798548094373e-05, 'epoch': 5.63} + 28%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1570/5520 [1:21:17<3:10:41, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6345452070236206, 'eval_runtime': 3.1432, 'eval_samples_per_second': 56.949, 'eval_steps_per_second': 56.949, 'epoch': 5.63} +{'loss': 43.1159, 'grad_norm': 235.9405059814453, 'learning_rate': 2.1555353901996368e-05, 'epoch': 5.64} +{'loss': 43.4384, 'grad_norm': 207.1119384765625, 'learning_rate': 2.1549909255898367e-05, 'epoch': 5.64} +{'loss': 42.436, 'grad_norm': 305.3013916015625, 'learning_rate': 2.1544464609800366e-05, 'epoch': 5.64} +{'loss': 39.6844, 'grad_norm': 226.25282287597656, 'learning_rate': 2.153901996370236e-05, 'epoch': 5.65} +{'loss': 35.9103, 'grad_norm': 201.5033416748047, 'learning_rate': 2.1533575317604356e-05, 'epoch': 5.65} +{'loss': 35.0026, 'grad_norm': 206.63229370117188, 'learning_rate': 2.1528130671506352e-05, 'epoch': 5.65} +{'loss': 35.6298, 'grad_norm': 212.67581176757812, 'learning_rate': 2.152268602540835e-05, 'epoch': 5.66} +{'loss': 36.0356, 'grad_norm': 193.2886199951172, 'learning_rate': 2.1517241379310346e-05, 'epoch': 5.66} +{'loss': 35.5423, 'grad_norm': 166.189208984375, 'learning_rate': 2.151179673321234e-05, 'epoch': 5.66} +{'loss': 36.6227, 'grad_norm': 288.91552734375, 'learning_rate': 2.1506352087114337e-05, 'epoch': 5.67} + 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1580/5520 [1:21:49<3:08:17, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6339959502220154, 'eval_runtime': 3.139, 'eval_samples_per_second': 57.024, 'eval_steps_per_second': 57.024, 'epoch': 5.67} +{'loss': 37.3015, 'grad_norm': 210.91664123535156, 'learning_rate': 2.1500907441016332e-05, 'epoch': 5.67} +{'loss': 36.961, 'grad_norm': 206.54299926757812, 'learning_rate': 2.149546279491833e-05, 'epoch': 5.68} +{'loss': 36.722, 'grad_norm': 206.55613708496094, 'learning_rate': 2.149001814882033e-05, 'epoch': 5.68} +{'loss': 37.7482, 'grad_norm': 206.86563110351562, 'learning_rate': 2.1484573502722325e-05, 'epoch': 5.68} +{'loss': 37.7964, 'grad_norm': 219.96533203125, 'learning_rate': 2.147912885662432e-05, 'epoch': 5.69} +{'loss': 38.6577, 'grad_norm': 226.23887634277344, 'learning_rate': 2.1473684210526316e-05, 'epoch': 5.69} +{'loss': 36.9764, 'grad_norm': 195.1751708984375, 'learning_rate': 2.146823956442831e-05, 'epoch': 5.69} +{'loss': 39.4842, 'grad_norm': 194.3510284423828, 'learning_rate': 2.146279491833031e-05, 'epoch': 5.7} +{'loss': 38.9574, 'grad_norm': 187.02281188964844, 'learning_rate': 2.1457350272232305e-05, 'epoch': 5.7} +{'loss': 37.6359, 'grad_norm': 242.91925048828125, 'learning_rate': 2.14519056261343e-05, 'epoch': 5.7} + 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1590/5520 [1:22:20<3:07:26, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6384473443031311, 'eval_runtime': 3.1383, 'eval_samples_per_second': 57.036, 'eval_steps_per_second': 57.036, 'epoch': 5.7} +{'loss': 31.3564, 'grad_norm': 242.9617156982422, 'learning_rate': 2.14464609800363e-05, 'epoch': 5.71} +{'loss': 24.2933, 'grad_norm': 182.00540161132812, 'learning_rate': 2.1441016333938295e-05, 'epoch': 5.71} +{'loss': 24.6299, 'grad_norm': 257.7115173339844, 'learning_rate': 2.143557168784029e-05, 'epoch': 5.72} +{'loss': 24.7344, 'grad_norm': 198.71554565429688, 'learning_rate': 2.143012704174229e-05, 'epoch': 5.72} +{'loss': 26.0825, 'grad_norm': 198.24520874023438, 'learning_rate': 2.1424682395644284e-05, 'epoch': 5.72} +{'loss': 45.1176, 'grad_norm': 248.9528045654297, 'learning_rate': 2.141923774954628e-05, 'epoch': 5.73} +{'loss': 45.8517, 'grad_norm': 293.7327575683594, 'learning_rate': 2.1413793103448275e-05, 'epoch': 5.73} +{'loss': 45.6659, 'grad_norm': 293.1148681640625, 'learning_rate': 2.140834845735027e-05, 'epoch': 5.73} +{'loss': 44.4863, 'grad_norm': 312.7779846191406, 'learning_rate': 2.140290381125227e-05, 'epoch': 5.74} +{'loss': 43.649, 'grad_norm': 309.1000061035156, 'learning_rate': 2.1397459165154265e-05, 'epoch': 5.74} + 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1600/5520 [1:22:51<3:06:36, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6471736431121826, 'eval_runtime': 3.1394, 'eval_samples_per_second': 57.016, 'eval_steps_per_second': 57.016, 'epoch': 5.74} +{'loss': 45.3135, 'grad_norm': 276.4226989746094, 'learning_rate': 2.1392014519056263e-05, 'epoch': 5.74} +{'loss': 44.4919, 'grad_norm': 233.6791229248047, 'learning_rate': 2.138656987295826e-05, 'epoch': 5.75} +{'loss': 44.8033, 'grad_norm': 194.2917022705078, 'learning_rate': 2.1381125226860254e-05, 'epoch': 5.75} +{'loss': 45.1427, 'grad_norm': 241.76060485839844, 'learning_rate': 2.137568058076225e-05, 'epoch': 5.75} +{'loss': 43.1769, 'grad_norm': 216.56283569335938, 'learning_rate': 2.137023593466425e-05, 'epoch': 5.76} +{'loss': 44.1141, 'grad_norm': 230.0026092529297, 'learning_rate': 2.1364791288566244e-05, 'epoch': 5.76} +{'loss': 40.7227, 'grad_norm': 191.55433654785156, 'learning_rate': 2.135934664246824e-05, 'epoch': 5.77} +{'loss': 40.9842, 'grad_norm': 180.25885009765625, 'learning_rate': 2.1353901996370235e-05, 'epoch': 5.77} +{'loss': 40.0403, 'grad_norm': 220.4018096923828, 'learning_rate': 2.134845735027223e-05, 'epoch': 5.77} +{'loss': 40.1543, 'grad_norm': 264.20587158203125, 'learning_rate': 2.1343012704174232e-05, 'epoch': 5.78} + 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1600/5520 [1:22:55<3:06:36, 2.86s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1610/5520 [1:23:24<3:08:55, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6374311447143555, 'eval_runtime': 3.1477, 'eval_samples_per_second': 56.867, 'eval_steps_per_second': 56.867, 'epoch': 5.78} +{'loss': 40.9575, 'grad_norm': 167.9457244873047, 'learning_rate': 2.1337568058076227e-05, 'epoch': 5.78} +{'loss': 39.5593, 'grad_norm': 190.05247497558594, 'learning_rate': 2.1332123411978223e-05, 'epoch': 5.78} +{'loss': 40.7016, 'grad_norm': 246.4980926513672, 'learning_rate': 2.1326678765880218e-05, 'epoch': 5.79} +{'loss': 41.7855, 'grad_norm': 208.7435302734375, 'learning_rate': 2.1321234119782214e-05, 'epoch': 5.79} +{'loss': 41.2129, 'grad_norm': 190.84188842773438, 'learning_rate': 2.1315789473684212e-05, 'epoch': 5.79} +{'loss': 40.8209, 'grad_norm': 196.7161102294922, 'learning_rate': 2.1310344827586208e-05, 'epoch': 5.8} +{'loss': 41.8345, 'grad_norm': 181.4319305419922, 'learning_rate': 2.1304900181488203e-05, 'epoch': 5.8} +{'loss': 43.1464, 'grad_norm': 201.2064971923828, 'learning_rate': 2.12994555353902e-05, 'epoch': 5.81} +{'loss': 42.6041, 'grad_norm': 199.15174865722656, 'learning_rate': 2.1294010889292197e-05, 'epoch': 5.81} +{'loss': 42.867, 'grad_norm': 231.0398406982422, 'learning_rate': 2.1288566243194193e-05, 'epoch': 5.81} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 29%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1620/5520 [1:23:56<3:12:28, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6334222555160522, 'eval_runtime': 3.1534, 'eval_samples_per_second': 56.764, 'eval_steps_per_second': 56.764, 'epoch': 5.81} +{'loss': 41.7717, 'grad_norm': 189.26132202148438, 'learning_rate': 2.128312159709619e-05, 'epoch': 5.82} +{'loss': 41.3994, 'grad_norm': 215.5289764404297, 'learning_rate': 2.1277676950998187e-05, 'epoch': 5.82} +{'loss': 41.8173, 'grad_norm': 267.4259033203125, 'learning_rate': 2.1272232304900182e-05, 'epoch': 5.82} +{'loss': 39.9873, 'grad_norm': 241.74749755859375, 'learning_rate': 2.1266787658802178e-05, 'epoch': 5.83} +{'loss': 37.0662, 'grad_norm': 242.233642578125, 'learning_rate': 2.1261343012704173e-05, 'epoch': 5.83} +{'loss': 36.8948, 'grad_norm': 217.06141662597656, 'learning_rate': 2.1255898366606172e-05, 'epoch': 5.83} +{'loss': 34.9909, 'grad_norm': 242.05567932128906, 'learning_rate': 2.1250453720508167e-05, 'epoch': 5.84} +{'loss': 35.603, 'grad_norm': 178.65618896484375, 'learning_rate': 2.1245009074410166e-05, 'epoch': 5.84} +{'loss': 35.9822, 'grad_norm': 216.36865234375, 'learning_rate': 2.123956442831216e-05, 'epoch': 5.85} +{'loss': 35.1473, 'grad_norm': 241.22161865234375, 'learning_rate': 2.1234119782214157e-05, 'epoch': 5.85} + 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1630/5520 [1:24:28<3:06:37, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6312161087989807, 'eval_runtime': 3.1421, 'eval_samples_per_second': 56.969, 'eval_steps_per_second': 56.969, 'epoch': 5.85} +{'loss': 36.145, 'grad_norm': 192.05210876464844, 'learning_rate': 2.1228675136116152e-05, 'epoch': 5.85} +{'loss': 37.7076, 'grad_norm': 194.0652618408203, 'learning_rate': 2.122323049001815e-05, 'epoch': 5.86} +{'loss': 37.6837, 'grad_norm': 255.59286499023438, 'learning_rate': 2.1217785843920146e-05, 'epoch': 5.86} +{'loss': 37.1681, 'grad_norm': 184.0017852783203, 'learning_rate': 2.121234119782214e-05, 'epoch': 5.86} +{'loss': 37.4902, 'grad_norm': 186.98338317871094, 'learning_rate': 2.1206896551724137e-05, 'epoch': 5.87} +{'loss': 37.2771, 'grad_norm': 253.53775024414062, 'learning_rate': 2.1201451905626132e-05, 'epoch': 5.87} +{'loss': 37.7681, 'grad_norm': 196.43038940429688, 'learning_rate': 2.119600725952813e-05, 'epoch': 5.87} +{'loss': 40.0097, 'grad_norm': 255.99879455566406, 'learning_rate': 2.119056261343013e-05, 'epoch': 5.88} +{'loss': 38.1076, 'grad_norm': 275.1465148925781, 'learning_rate': 2.1185117967332125e-05, 'epoch': 5.88} +{'loss': 38.6463, 'grad_norm': 281.8592529296875, 'learning_rate': 2.117967332123412e-05, 'epoch': 5.88} + 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1640/5520 [1:25:00<3:18:22, 3.07s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6449099779129028, 'eval_runtime': 3.1396, 'eval_samples_per_second': 57.015, 'eval_steps_per_second': 57.015, 'epoch': 5.88} +{'loss': 36.9158, 'grad_norm': 246.7912139892578, 'learning_rate': 2.1174228675136116e-05, 'epoch': 5.89} +{'loss': 25.1153, 'grad_norm': 176.7545623779297, 'learning_rate': 2.116878402903811e-05, 'epoch': 5.89} +{'loss': 24.1999, 'grad_norm': 202.2602996826172, 'learning_rate': 2.116333938294011e-05, 'epoch': 5.9} +{'loss': 24.185, 'grad_norm': 186.26255798339844, 'learning_rate': 2.1157894736842106e-05, 'epoch': 5.9} +{'loss': 26.1841, 'grad_norm': 231.0543670654297, 'learning_rate': 2.11524500907441e-05, 'epoch': 5.9} +{'loss': 47.1367, 'grad_norm': 336.677001953125, 'learning_rate': 2.1147005444646096e-05, 'epoch': 5.91} +{'loss': 46.7711, 'grad_norm': 299.3211975097656, 'learning_rate': 2.1141560798548095e-05, 'epoch': 5.91} +{'loss': 44.9163, 'grad_norm': 287.5389099121094, 'learning_rate': 2.1136116152450094e-05, 'epoch': 5.91} +{'loss': 45.1651, 'grad_norm': 290.34930419921875, 'learning_rate': 2.113067150635209e-05, 'epoch': 5.92} +{'loss': 45.6252, 'grad_norm': 244.7100372314453, 'learning_rate': 2.1125226860254085e-05, 'epoch': 5.92} + 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1650/5520 [1:25:32<3:07:46, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6506878733634949, 'eval_runtime': 3.1485, 'eval_samples_per_second': 56.852, 'eval_steps_per_second': 56.852, 'epoch': 5.92} +{'loss': 44.5345, 'grad_norm': 301.48223876953125, 'learning_rate': 2.111978221415608e-05, 'epoch': 5.92} +{'loss': 42.0263, 'grad_norm': 261.05987548828125, 'learning_rate': 2.1114337568058075e-05, 'epoch': 5.93} +{'loss': 41.2405, 'grad_norm': 220.4369659423828, 'learning_rate': 2.110889292196007e-05, 'epoch': 5.93} +{'loss': 42.2734, 'grad_norm': 261.3221435546875, 'learning_rate': 2.110344827586207e-05, 'epoch': 5.94} +{'loss': 43.0752, 'grad_norm': 253.70855712890625, 'learning_rate': 2.1098003629764065e-05, 'epoch': 5.94} +{'loss': 42.7103, 'grad_norm': 198.76138305664062, 'learning_rate': 2.1092558983666064e-05, 'epoch': 5.94} +{'loss': 42.6215, 'grad_norm': 212.21466064453125, 'learning_rate': 2.108711433756806e-05, 'epoch': 5.95} +{'loss': 42.795, 'grad_norm': 212.9633026123047, 'learning_rate': 2.1081669691470055e-05, 'epoch': 5.95} +{'loss': 43.8843, 'grad_norm': 263.2871398925781, 'learning_rate': 2.1076225045372053e-05, 'epoch': 5.95} +{'loss': 43.0161, 'grad_norm': 207.67120361328125, 'learning_rate': 2.107078039927405e-05, 'epoch': 5.96} + 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1660/5520 [1:26:04<3:05:49, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6315081715583801, 'eval_runtime': 3.1446, 'eval_samples_per_second': 56.924, 'eval_steps_per_second': 56.924, 'epoch': 5.96} +{'loss': 38.803, 'grad_norm': 176.6342010498047, 'learning_rate': 2.1065335753176044e-05, 'epoch': 5.96} +{'loss': 35.1905, 'grad_norm': 223.57485961914062, 'learning_rate': 2.105989110707804e-05, 'epoch': 5.96} +{'loss': 34.9454, 'grad_norm': 291.507568359375, 'learning_rate': 2.1054446460980035e-05, 'epoch': 5.97} +{'loss': 37.4404, 'grad_norm': 250.51063537597656, 'learning_rate': 2.104900181488203e-05, 'epoch': 5.97} +{'loss': 36.9775, 'grad_norm': 307.9601135253906, 'learning_rate': 2.1043557168784032e-05, 'epoch': 5.98} +{'loss': 38.2696, 'grad_norm': 277.24151611328125, 'learning_rate': 2.1038112522686028e-05, 'epoch': 5.98} +{'loss': 37.0656, 'grad_norm': 186.7593994140625, 'learning_rate': 2.1032667876588023e-05, 'epoch': 5.98} +{'loss': 38.1747, 'grad_norm': 201.67047119140625, 'learning_rate': 2.102722323049002e-05, 'epoch': 5.99} +{'loss': 39.3248, 'grad_norm': 216.87525939941406, 'learning_rate': 2.1021778584392014e-05, 'epoch': 5.99} +{'loss': 33.4017, 'grad_norm': 227.381103515625, 'learning_rate': 2.1016333938294013e-05, 'epoch': 5.99} + 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1670/5520 [1:26:35<3:05:43, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6369583010673523, 'eval_runtime': 3.1443, 'eval_samples_per_second': 56.928, 'eval_steps_per_second': 56.928, 'epoch': 5.99} +{'loss': 24.679, 'grad_norm': 237.2648468017578, 'learning_rate': 2.1010889292196008e-05, 'epoch': 6.0} +{'loss': 21.9552, 'grad_norm': 191.99951171875, 'learning_rate': 2.1005444646098003e-05, 'epoch': 6.0} +{'loss': 43.6884, 'grad_norm': 267.92181396484375, 'learning_rate': 2.1e-05, 'epoch': 6.0} +{'loss': 46.0709, 'grad_norm': 318.86602783203125, 'learning_rate': 2.0994555353901998e-05, 'epoch': 6.01} +{'loss': 44.2746, 'grad_norm': 282.772705078125, 'learning_rate': 2.0989110707803993e-05, 'epoch': 6.01} +{'loss': 43.818, 'grad_norm': 263.2024841308594, 'learning_rate': 2.0983666061705992e-05, 'epoch': 6.01} +{'loss': 43.9441, 'grad_norm': 229.41725158691406, 'learning_rate': 2.0978221415607987e-05, 'epoch': 6.02} +{'loss': 43.517, 'grad_norm': 253.25624084472656, 'learning_rate': 2.0972776769509983e-05, 'epoch': 6.02} +{'loss': 44.3685, 'grad_norm': 202.00238037109375, 'learning_rate': 2.0967332123411978e-05, 'epoch': 6.03} +{'loss': 44.9367, 'grad_norm': 196.92825317382812, 'learning_rate': 2.0961887477313973e-05, 'epoch': 6.03} + 30%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1680/5520 [1:27:07<3:04:21, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6381568312644958, 'eval_runtime': 3.1477, 'eval_samples_per_second': 56.867, 'eval_steps_per_second': 56.867, 'epoch': 6.03} +{'loss': 44.0743, 'grad_norm': 191.00900268554688, 'learning_rate': 2.0956442831215972e-05, 'epoch': 6.03} +{'loss': 43.3278, 'grad_norm': 195.92141723632812, 'learning_rate': 2.0950998185117967e-05, 'epoch': 6.04} +{'loss': 41.6419, 'grad_norm': 230.04708862304688, 'learning_rate': 2.0945553539019963e-05, 'epoch': 6.04} +{'loss': 41.0927, 'grad_norm': 215.70689392089844, 'learning_rate': 2.094010889292196e-05, 'epoch': 6.04} +{'loss': 40.1888, 'grad_norm': 227.51797485351562, 'learning_rate': 2.0934664246823957e-05, 'epoch': 6.05} +{'loss': 39.8766, 'grad_norm': 216.93089294433594, 'learning_rate': 2.0929219600725952e-05, 'epoch': 6.05} +{'loss': 40.3851, 'grad_norm': 199.3091583251953, 'learning_rate': 2.092377495462795e-05, 'epoch': 6.05} +{'loss': 40.5289, 'grad_norm': 188.56056213378906, 'learning_rate': 2.0918330308529947e-05, 'epoch': 6.06} +{'loss': 40.7509, 'grad_norm': 194.23265075683594, 'learning_rate': 2.0912885662431942e-05, 'epoch': 6.06} +{'loss': 41.3404, 'grad_norm': 199.7327423095703, 'learning_rate': 2.0907441016333937e-05, 'epoch': 6.07} + 31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1690/5520 [1:27:38<3:03:33, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6312655806541443, 'eval_runtime': 3.1482, 'eval_samples_per_second': 56.858, 'eval_steps_per_second': 56.858, 'epoch': 6.07} +{'loss': 41.3719, 'grad_norm': 189.40150451660156, 'learning_rate': 2.0901996370235933e-05, 'epoch': 6.07} +{'loss': 41.8194, 'grad_norm': 222.07705688476562, 'learning_rate': 2.089655172413793e-05, 'epoch': 6.07} +{'loss': 39.8522, 'grad_norm': 205.6264190673828, 'learning_rate': 2.089110707803993e-05, 'epoch': 6.08} +{'loss': 41.5093, 'grad_norm': 207.98802185058594, 'learning_rate': 2.0885662431941926e-05, 'epoch': 6.08} +{'loss': 41.7284, 'grad_norm': 197.24134826660156, 'learning_rate': 2.088021778584392e-05, 'epoch': 6.08} +{'loss': 42.7841, 'grad_norm': 220.84255981445312, 'learning_rate': 2.0874773139745916e-05, 'epoch': 6.09} +{'loss': 43.6391, 'grad_norm': 239.06854248046875, 'learning_rate': 2.0869328493647912e-05, 'epoch': 6.09} +{'loss': 41.9963, 'grad_norm': 193.2572021484375, 'learning_rate': 2.086388384754991e-05, 'epoch': 6.09} +{'loss': 41.9834, 'grad_norm': 206.66473388671875, 'learning_rate': 2.0858439201451906e-05, 'epoch': 6.1} +{'loss': 41.7128, 'grad_norm': 214.81956481933594, 'learning_rate': 2.08529945553539e-05, 'epoch': 6.1} + 31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1700/5520 [1:28:10<3:02:14, 2.86s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6309775114059448, 'eval_runtime': 3.1471, 'eval_samples_per_second': 56.878, 'eval_steps_per_second': 56.878, 'epoch': 6.1} +{'loss': 37.7807, 'grad_norm': 189.58360290527344, 'learning_rate': 2.0847549909255897e-05, 'epoch': 6.1} +{'loss': 37.7091, 'grad_norm': 265.76934814453125, 'learning_rate': 2.0842105263157895e-05, 'epoch': 6.11} +{'loss': 34.7386, 'grad_norm': 266.4632568359375, 'learning_rate': 2.0836660617059894e-05, 'epoch': 6.11} +{'loss': 34.9386, 'grad_norm': 309.3799743652344, 'learning_rate': 2.083121597096189e-05, 'epoch': 6.12} +{'loss': 34.9113, 'grad_norm': 252.98681640625, 'learning_rate': 2.0825771324863885e-05, 'epoch': 6.12} +{'loss': 35.1914, 'grad_norm': 199.3408660888672, 'learning_rate': 2.082032667876588e-05, 'epoch': 6.12} +{'loss': 36.3151, 'grad_norm': 231.67514038085938, 'learning_rate': 2.0814882032667876e-05, 'epoch': 6.13} +{'loss': 37.6763, 'grad_norm': 215.49317932128906, 'learning_rate': 2.080943738656987e-05, 'epoch': 6.13} +{'loss': 35.7805, 'grad_norm': 239.3602752685547, 'learning_rate': 2.080399274047187e-05, 'epoch': 6.13} +{'loss': 36.7353, 'grad_norm': 192.8195037841797, 'learning_rate': 2.0798548094373865e-05, 'epoch': 6.14} + 31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1710/5520 [1:28:41<3:02:19, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6290757060050964, 'eval_runtime': 3.1486, 'eval_samples_per_second': 56.851, 'eval_steps_per_second': 56.851, 'epoch': 6.14} +{'loss': 36.6377, 'grad_norm': 191.125, 'learning_rate': 2.0793103448275864e-05, 'epoch': 6.14} +{'loss': 36.5235, 'grad_norm': 232.39170837402344, 'learning_rate': 2.078765880217786e-05, 'epoch': 6.14} +{'loss': 37.7093, 'grad_norm': 259.41204833984375, 'learning_rate': 2.0782214156079855e-05, 'epoch': 6.15} +{'loss': 37.8061, 'grad_norm': 218.00814819335938, 'learning_rate': 2.0776769509981854e-05, 'epoch': 6.15} +{'loss': 37.9451, 'grad_norm': 183.78170776367188, 'learning_rate': 2.077132486388385e-05, 'epoch': 6.16} +{'loss': 38.687, 'grad_norm': 242.387939453125, 'learning_rate': 2.0765880217785844e-05, 'epoch': 6.16} +{'loss': 38.5109, 'grad_norm': 247.09152221679688, 'learning_rate': 2.076043557168784e-05, 'epoch': 6.16} +{'loss': 28.0115, 'grad_norm': 202.3104705810547, 'learning_rate': 2.0754990925589835e-05, 'epoch': 6.17} +{'loss': 23.8873, 'grad_norm': 239.5511016845703, 'learning_rate': 2.0749546279491834e-05, 'epoch': 6.17} +{'loss': 24.0236, 'grad_norm': 233.80007934570312, 'learning_rate': 2.0744101633393833e-05, 'epoch': 6.17} + 31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1720/5520 [1:29:13<3:02:41, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6451307535171509, 'eval_runtime': 3.1389, 'eval_samples_per_second': 57.027, 'eval_steps_per_second': 57.027, 'epoch': 6.17} +{'loss': 25.2521, 'grad_norm': 231.85955810546875, 'learning_rate': 2.0738656987295828e-05, 'epoch': 6.18} +{'loss': 25.5774, 'grad_norm': 207.05453491210938, 'learning_rate': 2.0733212341197823e-05, 'epoch': 6.18} +{'loss': 46.0267, 'grad_norm': 265.9180908203125, 'learning_rate': 2.072776769509982e-05, 'epoch': 6.18} +{'loss': 46.6262, 'grad_norm': 289.2763671875, 'learning_rate': 2.0722323049001814e-05, 'epoch': 6.19} +{'loss': 44.2758, 'grad_norm': 254.466552734375, 'learning_rate': 2.0716878402903813e-05, 'epoch': 6.19} +{'loss': 44.6334, 'grad_norm': 262.713134765625, 'learning_rate': 2.071143375680581e-05, 'epoch': 6.2} +{'loss': 44.9617, 'grad_norm': 272.8150939941406, 'learning_rate': 2.0705989110707804e-05, 'epoch': 6.2} +{'loss': 44.4382, 'grad_norm': 288.115478515625, 'learning_rate': 2.07005444646098e-05, 'epoch': 6.2} +{'loss': 44.8551, 'grad_norm': 226.08058166503906, 'learning_rate': 2.0695099818511795e-05, 'epoch': 6.21} +{'loss': 45.5901, 'grad_norm': 219.95835876464844, 'learning_rate': 2.0689655172413797e-05, 'epoch': 6.21} + 31%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1730/5520 [1:29:44<3:01:12, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6379314661026001, 'eval_runtime': 3.1381, 'eval_samples_per_second': 57.042, 'eval_steps_per_second': 57.042, 'epoch': 6.21} +{'loss': 44.0675, 'grad_norm': 190.3118896484375, 'learning_rate': 2.0684210526315792e-05, 'epoch': 6.21} +{'loss': 42.6333, 'grad_norm': 177.408935546875, 'learning_rate': 2.0678765880217787e-05, 'epoch': 6.22} +{'loss': 41.6771, 'grad_norm': 231.3040313720703, 'learning_rate': 2.0673321234119783e-05, 'epoch': 6.22} +{'loss': 41.0829, 'grad_norm': 226.51663208007812, 'learning_rate': 2.0667876588021778e-05, 'epoch': 6.22} +{'loss': 39.2682, 'grad_norm': 184.55775451660156, 'learning_rate': 2.0662431941923774e-05, 'epoch': 6.23} +{'loss': 40.4101, 'grad_norm': 205.0491943359375, 'learning_rate': 2.0656987295825772e-05, 'epoch': 6.23} +{'loss': 39.9147, 'grad_norm': 201.45838928222656, 'learning_rate': 2.0651542649727768e-05, 'epoch': 6.23} +{'loss': 40.7215, 'grad_norm': 220.16213989257812, 'learning_rate': 2.0646098003629763e-05, 'epoch': 6.24} +{'loss': 40.0256, 'grad_norm': 260.9661560058594, 'learning_rate': 2.0640653357531762e-05, 'epoch': 6.24} +{'loss': 41.1147, 'grad_norm': 314.2476806640625, 'learning_rate': 2.0635208711433757e-05, 'epoch': 6.25} + 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1740/5520 [1:30:16<3:00:34, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6347935199737549, 'eval_runtime': 3.1446, 'eval_samples_per_second': 56.924, 'eval_steps_per_second': 56.924, 'epoch': 6.25} +{'loss': 41.7255, 'grad_norm': 262.24505615234375, 'learning_rate': 2.0629764065335756e-05, 'epoch': 6.25} +{'loss': 41.2559, 'grad_norm': 212.0876922607422, 'learning_rate': 2.062431941923775e-05, 'epoch': 6.25} +{'loss': 41.1664, 'grad_norm': 185.3249969482422, 'learning_rate': 2.0618874773139747e-05, 'epoch': 6.26} +{'loss': 41.3357, 'grad_norm': 184.7873077392578, 'learning_rate': 2.0613430127041742e-05, 'epoch': 6.26} +{'loss': 43.0978, 'grad_norm': 230.11257934570312, 'learning_rate': 2.0607985480943738e-05, 'epoch': 6.26} +{'loss': 42.4169, 'grad_norm': 251.255126953125, 'learning_rate': 2.0602540834845733e-05, 'epoch': 6.27} +{'loss': 43.2969, 'grad_norm': 230.1149444580078, 'learning_rate': 2.0597096188747732e-05, 'epoch': 6.27} +{'loss': 42.6037, 'grad_norm': 217.2769012451172, 'learning_rate': 2.059165154264973e-05, 'epoch': 6.27} +{'loss': 42.1215, 'grad_norm': 189.85533142089844, 'learning_rate': 2.0586206896551726e-05, 'epoch': 6.28} +{'loss': 42.6337, 'grad_norm': 242.15667724609375, 'learning_rate': 2.058076225045372e-05, 'epoch': 6.28} + 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1750/5520 [1:30:48<3:01:42, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6310555934906006, 'eval_runtime': 3.1425, 'eval_samples_per_second': 56.96, 'eval_steps_per_second': 56.96, 'epoch': 6.28} +{'loss': 40.5315, 'grad_norm': 213.7873992919922, 'learning_rate': 2.0575317604355717e-05, 'epoch': 6.29} +{'loss': 38.9483, 'grad_norm': 243.86492919921875, 'learning_rate': 2.0569872958257715e-05, 'epoch': 6.29} +{'loss': 35.9627, 'grad_norm': 276.0108642578125, 'learning_rate': 2.056442831215971e-05, 'epoch': 6.29} +{'loss': 35.4305, 'grad_norm': 252.5875701904297, 'learning_rate': 2.0558983666061706e-05, 'epoch': 6.3} +{'loss': 35.2385, 'grad_norm': 227.15142822265625, 'learning_rate': 2.05535390199637e-05, 'epoch': 6.3} +{'loss': 35.735, 'grad_norm': 259.6727294921875, 'learning_rate': 2.0548094373865697e-05, 'epoch': 6.3} +{'loss': 36.8835, 'grad_norm': 185.07765197753906, 'learning_rate': 2.0542649727767696e-05, 'epoch': 6.31} +{'loss': 36.346, 'grad_norm': 207.650146484375, 'learning_rate': 2.0537205081669694e-05, 'epoch': 6.31} +{'loss': 36.1527, 'grad_norm': 223.2378692626953, 'learning_rate': 2.053176043557169e-05, 'epoch': 6.31} +{'loss': 35.7408, 'grad_norm': 162.90794372558594, 'learning_rate': 2.0526315789473685e-05, 'epoch': 6.32} + 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 1760/5520 [1:31:19<3:00:50, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6276403069496155, 'eval_runtime': 3.1411, 'eval_samples_per_second': 56.987, 'eval_steps_per_second': 56.987, 'epoch': 6.32} +{'loss': 37.7916, 'grad_norm': 165.8592987060547, 'learning_rate': 2.052087114337568e-05, 'epoch': 6.32} +{'loss': 36.8409, 'grad_norm': 179.7499542236328, 'learning_rate': 2.0515426497277676e-05, 'epoch': 6.33} +{'loss': 37.1766, 'grad_norm': 227.0990753173828, 'learning_rate': 2.0509981851179675e-05, 'epoch': 6.33} +{'loss': 37.5, 'grad_norm': 216.3297882080078, 'learning_rate': 2.050453720508167e-05, 'epoch': 6.33} +{'loss': 38.8293, 'grad_norm': 197.88409423828125, 'learning_rate': 2.0499092558983666e-05, 'epoch': 6.34} +{'loss': 37.9873, 'grad_norm': 189.74916076660156, 'learning_rate': 2.049364791288566e-05, 'epoch': 6.34} +{'loss': 39.3107, 'grad_norm': 241.16644287109375, 'learning_rate': 2.048820326678766e-05, 'epoch': 6.34} +{'loss': 36.2482, 'grad_norm': 224.3491668701172, 'learning_rate': 2.0482758620689655e-05, 'epoch': 6.35} +{'loss': 24.1945, 'grad_norm': 217.30882263183594, 'learning_rate': 2.0477313974591654e-05, 'epoch': 6.35} +{'loss': 24.2356, 'grad_norm': 213.23683166503906, 'learning_rate': 2.047186932849365e-05, 'epoch': 6.35} + 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1770/5520 [1:31:51<3:00:46, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6382855772972107, 'eval_runtime': 3.1517, 'eval_samples_per_second': 56.795, 'eval_steps_per_second': 56.795, 'epoch': 6.35} +{'loss': 25.1916, 'grad_norm': 209.8166961669922, 'learning_rate': 2.0466424682395645e-05, 'epoch': 6.36} +{'loss': 25.1372, 'grad_norm': 197.86773681640625, 'learning_rate': 2.046098003629764e-05, 'epoch': 6.36} +{'loss': 45.0431, 'grad_norm': 280.80517578125, 'learning_rate': 2.0455535390199635e-05, 'epoch': 6.36} +{'loss': 45.4893, 'grad_norm': 239.85861206054688, 'learning_rate': 2.0450090744101634e-05, 'epoch': 6.37} +{'loss': 45.3313, 'grad_norm': 302.56024169921875, 'learning_rate': 2.044464609800363e-05, 'epoch': 6.37} +{'loss': 44.703, 'grad_norm': 255.5519256591797, 'learning_rate': 2.043920145190563e-05, 'epoch': 6.38} +{'loss': 45.0278, 'grad_norm': 223.1331024169922, 'learning_rate': 2.0433756805807624e-05, 'epoch': 6.38} +{'loss': 44.7298, 'grad_norm': 240.68817138671875, 'learning_rate': 2.042831215970962e-05, 'epoch': 6.38} +{'loss': 44.0512, 'grad_norm': 239.5072021484375, 'learning_rate': 2.0422867513611614e-05, 'epoch': 6.39} +{'loss': 43.8646, 'grad_norm': 186.3783416748047, 'learning_rate': 2.0417422867513613e-05, 'epoch': 6.39} + 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1780/5520 [1:32:23<3:00:13, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6325972676277161, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.964, 'eval_steps_per_second': 56.964, 'epoch': 6.39} +{'loss': 43.8688, 'grad_norm': 169.77285766601562, 'learning_rate': 2.041197822141561e-05, 'epoch': 6.39} +{'loss': 42.5757, 'grad_norm': 158.4019012451172, 'learning_rate': 2.0406533575317604e-05, 'epoch': 6.4} +{'loss': 44.8075, 'grad_norm': 209.79916381835938, 'learning_rate': 2.04010889292196e-05, 'epoch': 6.4} +{'loss': 42.0121, 'grad_norm': 215.74639892578125, 'learning_rate': 2.0395644283121595e-05, 'epoch': 6.4} +{'loss': 40.6564, 'grad_norm': 215.21121215820312, 'learning_rate': 2.0390199637023597e-05, 'epoch': 6.41} +{'loss': 40.543, 'grad_norm': 244.49574279785156, 'learning_rate': 2.0384754990925592e-05, 'epoch': 6.41} +{'loss': 39.5569, 'grad_norm': 189.22781372070312, 'learning_rate': 2.0379310344827588e-05, 'epoch': 6.42} +{'loss': 40.0789, 'grad_norm': 204.32664489746094, 'learning_rate': 2.0373865698729583e-05, 'epoch': 6.42} +{'loss': 39.6436, 'grad_norm': 217.5277557373047, 'learning_rate': 2.036842105263158e-05, 'epoch': 6.42} +{'loss': 41.0794, 'grad_norm': 196.25918579101562, 'learning_rate': 2.0362976406533574e-05, 'epoch': 6.43} + 32%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1790/5520 [1:32:55<2:58:52, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6334295868873596, 'eval_runtime': 3.1471, 'eval_samples_per_second': 56.878, 'eval_steps_per_second': 56.878, 'epoch': 6.43} +{'loss': 41.2976, 'grad_norm': 191.50656127929688, 'learning_rate': 2.0357531760435573e-05, 'epoch': 6.43} +{'loss': 41.0843, 'grad_norm': 192.98692321777344, 'learning_rate': 2.0352087114337568e-05, 'epoch': 6.43} +{'loss': 40.4123, 'grad_norm': 197.32862854003906, 'learning_rate': 2.0346642468239563e-05, 'epoch': 6.44} +{'loss': 41.9185, 'grad_norm': 205.18751525878906, 'learning_rate': 2.0341197822141562e-05, 'epoch': 6.44} +{'loss': 41.6794, 'grad_norm': 201.69070434570312, 'learning_rate': 2.0335753176043558e-05, 'epoch': 6.44} +{'loss': 43.5805, 'grad_norm': 218.77044677734375, 'learning_rate': 2.0330308529945556e-05, 'epoch': 6.45} +{'loss': 41.2777, 'grad_norm': 183.25967407226562, 'learning_rate': 2.0324863883847552e-05, 'epoch': 6.45} +{'loss': 42.4618, 'grad_norm': 219.97369384765625, 'learning_rate': 2.0319419237749547e-05, 'epoch': 6.46} +{'loss': 41.6424, 'grad_norm': 216.1624298095703, 'learning_rate': 2.0313974591651542e-05, 'epoch': 6.46} +{'loss': 41.4058, 'grad_norm': 222.29965209960938, 'learning_rate': 2.0308529945553538e-05, 'epoch': 6.46} + 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1800/5520 [1:33:26<2:59:38, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6282982230186462, 'eval_runtime': 3.1404, 'eval_samples_per_second': 56.999, 'eval_steps_per_second': 56.999, 'epoch': 6.46} +{'loss': 39.474, 'grad_norm': 215.50511169433594, 'learning_rate': 2.0303085299455533e-05, 'epoch': 6.47} +{'loss': 36.0508, 'grad_norm': 237.2119903564453, 'learning_rate': 2.0297640653357532e-05, 'epoch': 6.47} +{'loss': 34.1704, 'grad_norm': 234.52975463867188, 'learning_rate': 2.029219600725953e-05, 'epoch': 6.47} +{'loss': 34.7592, 'grad_norm': 213.22216796875, 'learning_rate': 2.0286751361161526e-05, 'epoch': 6.48} +{'loss': 35.3051, 'grad_norm': 215.77244567871094, 'learning_rate': 2.028130671506352e-05, 'epoch': 6.48} +{'loss': 35.2493, 'grad_norm': 179.0439910888672, 'learning_rate': 2.0275862068965517e-05, 'epoch': 6.48} +{'loss': 35.6169, 'grad_norm': 217.47218322753906, 'learning_rate': 2.0270417422867516e-05, 'epoch': 6.49} +{'loss': 36.428, 'grad_norm': 191.3380584716797, 'learning_rate': 2.026497277676951e-05, 'epoch': 6.49} +{'loss': 36.5983, 'grad_norm': 200.8570098876953, 'learning_rate': 2.0259528130671506e-05, 'epoch': 6.49} +{'loss': 36.0163, 'grad_norm': 173.1240234375, 'learning_rate': 2.0254083484573502e-05, 'epoch': 6.5} + 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1800/5520 [1:33:29<2:59:38, 2.90s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1810/5520 [1:33:59<3:00:07, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6268841624259949, 'eval_runtime': 3.146, 'eval_samples_per_second': 56.898, 'eval_steps_per_second': 56.898, 'epoch': 6.5} +{'loss': 36.2461, 'grad_norm': 225.66845703125, 'learning_rate': 2.0248638838475497e-05, 'epoch': 6.5} +{'loss': 37.416, 'grad_norm': 189.66233825683594, 'learning_rate': 2.0243194192377496e-05, 'epoch': 6.51} +{'loss': 38.5309, 'grad_norm': 243.0270233154297, 'learning_rate': 2.0237749546279495e-05, 'epoch': 6.51} +{'loss': 37.087, 'grad_norm': 192.0927276611328, 'learning_rate': 2.023230490018149e-05, 'epoch': 6.51} +{'loss': 37.8877, 'grad_norm': 222.2957305908203, 'learning_rate': 2.0226860254083486e-05, 'epoch': 6.52} +{'loss': 39.2138, 'grad_norm': 259.84722900390625, 'learning_rate': 2.022141560798548e-05, 'epoch': 6.52} +{'loss': 38.6066, 'grad_norm': 205.5794219970703, 'learning_rate': 2.0215970961887476e-05, 'epoch': 6.52} +{'loss': 36.1581, 'grad_norm': 300.455810546875, 'learning_rate': 2.0210526315789475e-05, 'epoch': 6.53} +{'loss': 24.3689, 'grad_norm': 207.18063354492188, 'learning_rate': 2.020508166969147e-05, 'epoch': 6.53} +{'loss': 23.7019, 'grad_norm': 230.98516845703125, 'learning_rate': 2.0199637023593466e-05, 'epoch': 6.53} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1820/5520 [1:34:31<2:58:49, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6379140615463257, 'eval_runtime': 3.1451, 'eval_samples_per_second': 56.915, 'eval_steps_per_second': 56.915, 'epoch': 6.53} +{'loss': 24.5035, 'grad_norm': 153.8694610595703, 'learning_rate': 2.019419237749546e-05, 'epoch': 6.54} +{'loss': 26.1645, 'grad_norm': 229.9432373046875, 'learning_rate': 2.018874773139746e-05, 'epoch': 6.54} +{'loss': 45.6349, 'grad_norm': 325.3592529296875, 'learning_rate': 2.018330308529946e-05, 'epoch': 6.55} +{'loss': 45.5545, 'grad_norm': 261.0744323730469, 'learning_rate': 2.0177858439201454e-05, 'epoch': 6.55} +{'loss': 45.321, 'grad_norm': 261.4237976074219, 'learning_rate': 2.017241379310345e-05, 'epoch': 6.55} +{'loss': 44.5963, 'grad_norm': 238.8377685546875, 'learning_rate': 2.0166969147005445e-05, 'epoch': 6.56} +{'loss': 43.593, 'grad_norm': 225.89730834960938, 'learning_rate': 2.016152450090744e-05, 'epoch': 6.56} +{'loss': 43.536, 'grad_norm': 265.09625244140625, 'learning_rate': 2.0156079854809436e-05, 'epoch': 6.56} +{'loss': 44.1125, 'grad_norm': 257.9114685058594, 'learning_rate': 2.0150635208711434e-05, 'epoch': 6.57} +{'loss': 45.097, 'grad_norm': 188.06382751464844, 'learning_rate': 2.014519056261343e-05, 'epoch': 6.57} + 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 1830/5520 [1:35:02<2:57:55, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6347097754478455, 'eval_runtime': 3.1438, 'eval_samples_per_second': 56.938, 'eval_steps_per_second': 56.938, 'epoch': 6.57} +{'loss': 43.9367, 'grad_norm': 227.7350616455078, 'learning_rate': 2.013974591651543e-05, 'epoch': 6.57} +{'loss': 43.8266, 'grad_norm': 207.54774475097656, 'learning_rate': 2.0134301270417424e-05, 'epoch': 6.58} +{'loss': 42.7973, 'grad_norm': 204.62364196777344, 'learning_rate': 2.012885662431942e-05, 'epoch': 6.58} +{'loss': 42.7741, 'grad_norm': 244.32159423828125, 'learning_rate': 2.0123411978221418e-05, 'epoch': 6.59} +{'loss': 40.6529, 'grad_norm': 304.9100036621094, 'learning_rate': 2.0117967332123414e-05, 'epoch': 6.59} +{'loss': 40.2909, 'grad_norm': 275.5767517089844, 'learning_rate': 2.011252268602541e-05, 'epoch': 6.59} +{'loss': 39.8786, 'grad_norm': 227.69642639160156, 'learning_rate': 2.0107078039927404e-05, 'epoch': 6.6} +{'loss': 40.7009, 'grad_norm': 261.4333190917969, 'learning_rate': 2.01016333938294e-05, 'epoch': 6.6} +{'loss': 40.0595, 'grad_norm': 213.0095977783203, 'learning_rate': 2.0096188747731395e-05, 'epoch': 6.6} +{'loss': 40.8939, 'grad_norm': 251.78590393066406, 'learning_rate': 2.0090744101633397e-05, 'epoch': 6.61} + 33%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1840/5520 [1:35:34<2:56:49, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6333281397819519, 'eval_runtime': 3.1437, 'eval_samples_per_second': 56.94, 'eval_steps_per_second': 56.94, 'epoch': 6.61} +{'loss': 41.4123, 'grad_norm': 224.89805603027344, 'learning_rate': 2.0085299455535393e-05, 'epoch': 6.61} +{'loss': 41.3483, 'grad_norm': 195.67982482910156, 'learning_rate': 2.0079854809437388e-05, 'epoch': 6.61} +{'loss': 40.5516, 'grad_norm': 214.318603515625, 'learning_rate': 2.0074410163339383e-05, 'epoch': 6.62} +{'loss': 41.3523, 'grad_norm': 226.60968017578125, 'learning_rate': 2.006896551724138e-05, 'epoch': 6.62} +{'loss': 41.8734, 'grad_norm': 231.63604736328125, 'learning_rate': 2.0063520871143378e-05, 'epoch': 6.62} +{'loss': 42.7386, 'grad_norm': 224.1644287109375, 'learning_rate': 2.0058076225045373e-05, 'epoch': 6.63} +{'loss': 42.4525, 'grad_norm': 273.651123046875, 'learning_rate': 2.0052631578947368e-05, 'epoch': 6.63} +{'loss': 42.1051, 'grad_norm': 270.8088684082031, 'learning_rate': 2.0047186932849364e-05, 'epoch': 6.64} +{'loss': 42.1301, 'grad_norm': 303.1058044433594, 'learning_rate': 2.0041742286751362e-05, 'epoch': 6.64} +{'loss': 42.1495, 'grad_norm': 207.29380798339844, 'learning_rate': 2.0036297640653358e-05, 'epoch': 6.64} + 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1850/5520 [1:36:06<2:56:55, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6321585774421692, 'eval_runtime': 3.1483, 'eval_samples_per_second': 56.856, 'eval_steps_per_second': 56.856, 'epoch': 6.64} +{'loss': 39.6408, 'grad_norm': 262.1852722167969, 'learning_rate': 2.0030852994555357e-05, 'epoch': 6.65} +{'loss': 37.6177, 'grad_norm': 233.7991943359375, 'learning_rate': 2.0025408348457352e-05, 'epoch': 6.65} +{'loss': 35.4287, 'grad_norm': 247.25514221191406, 'learning_rate': 2.0019963702359347e-05, 'epoch': 6.65} +{'loss': 34.2335, 'grad_norm': 191.53343200683594, 'learning_rate': 2.0014519056261343e-05, 'epoch': 6.66} +{'loss': 35.8097, 'grad_norm': 245.22821044921875, 'learning_rate': 2.0009074410163338e-05, 'epoch': 6.66} +{'loss': 35.2621, 'grad_norm': 213.8151092529297, 'learning_rate': 2.0003629764065337e-05, 'epoch': 6.66} +{'loss': 36.6137, 'grad_norm': 174.6085205078125, 'learning_rate': 1.9998185117967332e-05, 'epoch': 6.67} +{'loss': 37.5896, 'grad_norm': 287.4677429199219, 'learning_rate': 1.9992740471869328e-05, 'epoch': 6.67} +{'loss': 36.5515, 'grad_norm': 224.59771728515625, 'learning_rate': 1.9987295825771326e-05, 'epoch': 6.68} +{'loss': 36.2511, 'grad_norm': 212.73065185546875, 'learning_rate': 1.9981851179673322e-05, 'epoch': 6.68} + 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1860/5520 [1:36:37<2:56:43, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6308404803276062, 'eval_runtime': 3.1419, 'eval_samples_per_second': 56.972, 'eval_steps_per_second': 56.972, 'epoch': 6.68} +{'loss': 37.6949, 'grad_norm': 214.7340850830078, 'learning_rate': 1.9976406533575317e-05, 'epoch': 6.68} +{'loss': 36.5785, 'grad_norm': 220.3029327392578, 'learning_rate': 1.9970961887477316e-05, 'epoch': 6.69} +{'loss': 38.5277, 'grad_norm': 198.97564697265625, 'learning_rate': 1.996551724137931e-05, 'epoch': 6.69} +{'loss': 37.5197, 'grad_norm': 180.94789123535156, 'learning_rate': 1.9960072595281307e-05, 'epoch': 6.69} +{'loss': 37.3483, 'grad_norm': 212.17584228515625, 'learning_rate': 1.9954627949183302e-05, 'epoch': 6.7} +{'loss': 38.5224, 'grad_norm': 253.88601684570312, 'learning_rate': 1.9949183303085298e-05, 'epoch': 6.7} +{'loss': 37.5679, 'grad_norm': 193.17698669433594, 'learning_rate': 1.9943738656987296e-05, 'epoch': 6.7} +{'loss': 27.7344, 'grad_norm': 217.2652130126953, 'learning_rate': 1.9938294010889295e-05, 'epoch': 6.71} +{'loss': 24.3864, 'grad_norm': 183.9295196533203, 'learning_rate': 1.993284936479129e-05, 'epoch': 6.71} +{'loss': 23.7328, 'grad_norm': 200.3455352783203, 'learning_rate': 1.9927404718693286e-05, 'epoch': 6.72} + 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1870/5520 [1:37:09<2:55:47, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.636415421962738, 'eval_runtime': 3.1435, 'eval_samples_per_second': 56.943, 'eval_steps_per_second': 56.943, 'epoch': 6.72} +{'loss': 24.6541, 'grad_norm': 206.7858123779297, 'learning_rate': 1.992196007259528e-05, 'epoch': 6.72} +{'loss': 25.1223, 'grad_norm': 208.10414123535156, 'learning_rate': 1.9916515426497277e-05, 'epoch': 6.72} +{'loss': 44.8561, 'grad_norm': 270.6657409667969, 'learning_rate': 1.9911070780399275e-05, 'epoch': 6.73} +{'loss': 45.8683, 'grad_norm': 246.69094848632812, 'learning_rate': 1.990562613430127e-05, 'epoch': 6.73} +{'loss': 45.1845, 'grad_norm': 243.4462432861328, 'learning_rate': 1.9900181488203266e-05, 'epoch': 6.73} +{'loss': 43.9492, 'grad_norm': 218.0637969970703, 'learning_rate': 1.989473684210526e-05, 'epoch': 6.74} +{'loss': 44.0612, 'grad_norm': 200.28140258789062, 'learning_rate': 1.988929219600726e-05, 'epoch': 6.74} +{'loss': 43.4748, 'grad_norm': 200.3120880126953, 'learning_rate': 1.988384754990926e-05, 'epoch': 6.74} +{'loss': 43.6851, 'grad_norm': 186.1811065673828, 'learning_rate': 1.9878402903811254e-05, 'epoch': 6.75} +{'loss': 44.4196, 'grad_norm': 208.15167236328125, 'learning_rate': 1.987295825771325e-05, 'epoch': 6.75} + 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1880/5520 [1:37:41<2:55:39, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6353851556777954, 'eval_runtime': 3.1436, 'eval_samples_per_second': 56.94, 'eval_steps_per_second': 56.94, 'epoch': 6.75} +{'loss': 44.1493, 'grad_norm': 207.500244140625, 'learning_rate': 1.9867513611615245e-05, 'epoch': 6.75} +{'loss': 44.6587, 'grad_norm': 238.17047119140625, 'learning_rate': 1.986206896551724e-05, 'epoch': 6.76} +{'loss': 43.2409, 'grad_norm': 192.9468231201172, 'learning_rate': 1.9856624319419236e-05, 'epoch': 6.76} +{'loss': 40.8636, 'grad_norm': 205.26492309570312, 'learning_rate': 1.9851179673321235e-05, 'epoch': 6.77} +{'loss': 41.0769, 'grad_norm': 190.49908447265625, 'learning_rate': 1.984573502722323e-05, 'epoch': 6.77} +{'loss': 40.1137, 'grad_norm': 206.56097412109375, 'learning_rate': 1.984029038112523e-05, 'epoch': 6.77} +{'loss': 41.0114, 'grad_norm': 212.89256286621094, 'learning_rate': 1.9834845735027224e-05, 'epoch': 6.78} +{'loss': 40.6027, 'grad_norm': 197.24267578125, 'learning_rate': 1.982940108892922e-05, 'epoch': 6.78} +{'loss': 40.5933, 'grad_norm': 187.01942443847656, 'learning_rate': 1.982395644283122e-05, 'epoch': 6.78} +{'loss': 41.2282, 'grad_norm': 236.31092834472656, 'learning_rate': 1.9818511796733214e-05, 'epoch': 6.79} + 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1890/5520 [1:38:12<2:54:22, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6299392580986023, 'eval_runtime': 3.1357, 'eval_samples_per_second': 57.085, 'eval_steps_per_second': 57.085, 'epoch': 6.79} +{'loss': 41.5858, 'grad_norm': 194.92059326171875, 'learning_rate': 1.981306715063521e-05, 'epoch': 6.79} +{'loss': 40.6826, 'grad_norm': 192.26272583007812, 'learning_rate': 1.9807622504537205e-05, 'epoch': 6.79} +{'loss': 40.0867, 'grad_norm': 181.8116912841797, 'learning_rate': 1.98021778584392e-05, 'epoch': 6.8} +{'loss': 41.4496, 'grad_norm': 219.03494262695312, 'learning_rate': 1.9796733212341195e-05, 'epoch': 6.8} +{'loss': 42.4147, 'grad_norm': 190.7852325439453, 'learning_rate': 1.9791288566243194e-05, 'epoch': 6.81} +{'loss': 42.0316, 'grad_norm': 200.32476806640625, 'learning_rate': 1.9785843920145193e-05, 'epoch': 6.81} +{'loss': 39.6992, 'grad_norm': 240.6086883544922, 'learning_rate': 1.9780399274047188e-05, 'epoch': 6.81} +{'loss': 42.9572, 'grad_norm': 222.31700134277344, 'learning_rate': 1.9774954627949184e-05, 'epoch': 6.82} +{'loss': 42.5147, 'grad_norm': 215.65292358398438, 'learning_rate': 1.976950998185118e-05, 'epoch': 6.82} +{'loss': 40.9536, 'grad_norm': 195.71624755859375, 'learning_rate': 1.9764065335753178e-05, 'epoch': 6.82} + 34%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1900/5520 [1:38:44<2:53:36, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6288287043571472, 'eval_runtime': 3.144, 'eval_samples_per_second': 56.935, 'eval_steps_per_second': 56.935, 'epoch': 6.82} +{'loss': 40.1754, 'grad_norm': 202.301025390625, 'learning_rate': 1.9758620689655173e-05, 'epoch': 6.83} +{'loss': 35.7505, 'grad_norm': 217.07186889648438, 'learning_rate': 1.975317604355717e-05, 'epoch': 6.83} +{'loss': 34.813, 'grad_norm': 189.78782653808594, 'learning_rate': 1.9747731397459164e-05, 'epoch': 6.83} +{'loss': 33.932, 'grad_norm': 247.2117462158203, 'learning_rate': 1.974228675136116e-05, 'epoch': 6.84} +{'loss': 36.2514, 'grad_norm': 244.06321716308594, 'learning_rate': 1.9736842105263158e-05, 'epoch': 6.84} +{'loss': 35.2123, 'grad_norm': 235.78692626953125, 'learning_rate': 1.9731397459165157e-05, 'epoch': 6.85} +{'loss': 36.5477, 'grad_norm': 193.82456970214844, 'learning_rate': 1.9725952813067152e-05, 'epoch': 6.85} +{'loss': 36.1244, 'grad_norm': 230.2017059326172, 'learning_rate': 1.9720508166969148e-05, 'epoch': 6.85} +{'loss': 36.7059, 'grad_norm': 205.5274200439453, 'learning_rate': 1.9715063520871143e-05, 'epoch': 6.86} +{'loss': 36.6212, 'grad_norm': 236.6873016357422, 'learning_rate': 1.970961887477314e-05, 'epoch': 6.86} + 35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1910/5520 [1:39:16<2:54:03, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6235609650611877, 'eval_runtime': 3.1497, 'eval_samples_per_second': 56.831, 'eval_steps_per_second': 56.831, 'epoch': 6.86} +{'loss': 37.3918, 'grad_norm': 217.63638305664062, 'learning_rate': 1.9704174228675137e-05, 'epoch': 6.86} +{'loss': 37.8555, 'grad_norm': 169.31996154785156, 'learning_rate': 1.9698729582577133e-05, 'epoch': 6.87} +{'loss': 38.0013, 'grad_norm': 204.2144775390625, 'learning_rate': 1.9693284936479128e-05, 'epoch': 6.87} +{'loss': 37.2128, 'grad_norm': 219.13595581054688, 'learning_rate': 1.9687840290381127e-05, 'epoch': 6.87} +{'loss': 39.272, 'grad_norm': 189.8477325439453, 'learning_rate': 1.9682395644283122e-05, 'epoch': 6.88} +{'loss': 37.5185, 'grad_norm': 214.21360778808594, 'learning_rate': 1.967695099818512e-05, 'epoch': 6.88} +{'loss': 37.6195, 'grad_norm': 252.57867431640625, 'learning_rate': 1.9671506352087116e-05, 'epoch': 6.88} +{'loss': 29.083, 'grad_norm': 169.85382080078125, 'learning_rate': 1.966606170598911e-05, 'epoch': 6.89} +{'loss': 24.4547, 'grad_norm': 161.38137817382812, 'learning_rate': 1.9660617059891107e-05, 'epoch': 6.89} +{'loss': 24.2235, 'grad_norm': 192.5706787109375, 'learning_rate': 1.9655172413793102e-05, 'epoch': 6.9} + 35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1920/5520 [1:39:47<2:53:01, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6387229561805725, 'eval_runtime': 3.1483, 'eval_samples_per_second': 56.856, 'eval_steps_per_second': 56.856, 'epoch': 6.9} +{'loss': 24.8032, 'grad_norm': 177.5368194580078, 'learning_rate': 1.9649727767695098e-05, 'epoch': 6.9} +{'loss': 25.7293, 'grad_norm': 206.98458862304688, 'learning_rate': 1.9644283121597097e-05, 'epoch': 6.9} +{'loss': 44.2514, 'grad_norm': 238.7289581298828, 'learning_rate': 1.9638838475499095e-05, 'epoch': 6.91} +{'loss': 44.4858, 'grad_norm': 225.86854553222656, 'learning_rate': 1.963339382940109e-05, 'epoch': 6.91} +{'loss': 44.5351, 'grad_norm': 235.71524047851562, 'learning_rate': 1.9627949183303086e-05, 'epoch': 6.91} +{'loss': 44.0865, 'grad_norm': 233.1634063720703, 'learning_rate': 1.962250453720508e-05, 'epoch': 6.92} +{'loss': 45.0226, 'grad_norm': 201.48944091796875, 'learning_rate': 1.961705989110708e-05, 'epoch': 6.92} +{'loss': 44.3969, 'grad_norm': 226.95469665527344, 'learning_rate': 1.9611615245009076e-05, 'epoch': 6.92} +{'loss': 41.3037, 'grad_norm': 242.79940795898438, 'learning_rate': 1.960617059891107e-05, 'epoch': 6.93} +{'loss': 41.3567, 'grad_norm': 255.3524932861328, 'learning_rate': 1.9600725952813066e-05, 'epoch': 6.93} + 35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1930/5520 [1:40:19<2:52:36, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6346065998077393, 'eval_runtime': 3.1393, 'eval_samples_per_second': 57.018, 'eval_steps_per_second': 57.018, 'epoch': 6.93} +{'loss': 41.142, 'grad_norm': 277.0763854980469, 'learning_rate': 1.9595281306715062e-05, 'epoch': 6.94} +{'loss': 42.1963, 'grad_norm': 176.02658081054688, 'learning_rate': 1.958983666061706e-05, 'epoch': 6.94} +{'loss': 42.351, 'grad_norm': 236.36398315429688, 'learning_rate': 1.958439201451906e-05, 'epoch': 6.94} +{'loss': 41.5248, 'grad_norm': 203.0919647216797, 'learning_rate': 1.9578947368421055e-05, 'epoch': 6.95} +{'loss': 42.1004, 'grad_norm': 273.605712890625, 'learning_rate': 1.957350272232305e-05, 'epoch': 6.95} +{'loss': 42.6326, 'grad_norm': 214.04319763183594, 'learning_rate': 1.9568058076225045e-05, 'epoch': 6.95} +{'loss': 43.8045, 'grad_norm': 250.81832885742188, 'learning_rate': 1.956261343012704e-05, 'epoch': 6.96} +{'loss': 39.8991, 'grad_norm': 233.58116149902344, 'learning_rate': 1.955716878402904e-05, 'epoch': 6.96} +{'loss': 34.6192, 'grad_norm': 269.0545654296875, 'learning_rate': 1.9551724137931035e-05, 'epoch': 6.96} +{'loss': 35.7568, 'grad_norm': 266.1218566894531, 'learning_rate': 1.954627949183303e-05, 'epoch': 6.97} + 35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1940/5520 [1:40:50<2:49:23, 2.84s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6233173608779907, 'eval_runtime': 3.1417, 'eval_samples_per_second': 56.975, 'eval_steps_per_second': 56.975, 'epoch': 6.97} +{'loss': 36.0795, 'grad_norm': 294.6914978027344, 'learning_rate': 1.9540834845735026e-05, 'epoch': 6.97} +{'loss': 37.2715, 'grad_norm': 373.6831970214844, 'learning_rate': 1.9535390199637025e-05, 'epoch': 6.98} +{'loss': 37.8335, 'grad_norm': 240.34738159179688, 'learning_rate': 1.952994555353902e-05, 'epoch': 6.98} +{'loss': 37.8251, 'grad_norm': 312.1968994140625, 'learning_rate': 1.952450090744102e-05, 'epoch': 6.98} +{'loss': 38.8466, 'grad_norm': 276.3544006347656, 'learning_rate': 1.9519056261343014e-05, 'epoch': 6.99} +{'loss': 37.774, 'grad_norm': 282.6874694824219, 'learning_rate': 1.951361161524501e-05, 'epoch': 6.99} +{'loss': 34.3747, 'grad_norm': 323.96612548828125, 'learning_rate': 1.9508166969147005e-05, 'epoch': 6.99} +{'loss': 24.5297, 'grad_norm': 235.02915954589844, 'learning_rate': 1.9502722323049e-05, 'epoch': 7.0} +{'loss': 22.3179, 'grad_norm': 176.4046173095703, 'learning_rate': 1.9497277676951e-05, 'epoch': 7.0} +{'loss': 42.225, 'grad_norm': 248.2797393798828, 'learning_rate': 1.9491833030852994e-05, 'epoch': 7.0} + 35%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1950/5520 [1:41:22<2:53:12, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6272363066673279, 'eval_runtime': 3.1453, 'eval_samples_per_second': 56.911, 'eval_steps_per_second': 56.911, 'epoch': 7.0} +{'loss': 43.6526, 'grad_norm': 235.9131622314453, 'learning_rate': 1.9486388384754993e-05, 'epoch': 7.01} +{'loss': 42.9052, 'grad_norm': 223.63479614257812, 'learning_rate': 1.948094373865699e-05, 'epoch': 7.01} +{'loss': 43.5819, 'grad_norm': 203.92141723632812, 'learning_rate': 1.9475499092558984e-05, 'epoch': 7.01} +{'loss': 43.1077, 'grad_norm': 209.6050567626953, 'learning_rate': 1.947005444646098e-05, 'epoch': 7.02} +{'loss': 42.7508, 'grad_norm': 245.77700805664062, 'learning_rate': 1.9464609800362978e-05, 'epoch': 7.02} +{'loss': 42.5234, 'grad_norm': 203.13465881347656, 'learning_rate': 1.9459165154264973e-05, 'epoch': 7.03} +{'loss': 44.0725, 'grad_norm': 226.4978485107422, 'learning_rate': 1.945372050816697e-05, 'epoch': 7.03} +{'loss': 42.6408, 'grad_norm': 225.68116760253906, 'learning_rate': 1.9448275862068964e-05, 'epoch': 7.03} +{'loss': 41.7696, 'grad_norm': 182.14202880859375, 'learning_rate': 1.944283121597096e-05, 'epoch': 7.04} +{'loss': 42.7008, 'grad_norm': 196.1949005126953, 'learning_rate': 1.9437386569872962e-05, 'epoch': 7.04} + 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 1960/5520 [1:41:54<2:51:37, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6277336478233337, 'eval_runtime': 3.1479, 'eval_samples_per_second': 56.862, 'eval_steps_per_second': 56.862, 'epoch': 7.04} +{'loss': 41.9946, 'grad_norm': 180.6853485107422, 'learning_rate': 1.9431941923774957e-05, 'epoch': 7.04} +{'loss': 39.8965, 'grad_norm': 199.0644073486328, 'learning_rate': 1.9426497277676953e-05, 'epoch': 7.05} +{'loss': 39.3263, 'grad_norm': 208.21371459960938, 'learning_rate': 1.9421052631578948e-05, 'epoch': 7.05} +{'loss': 40.1478, 'grad_norm': 239.78677368164062, 'learning_rate': 1.9415607985480943e-05, 'epoch': 7.05} +{'loss': 40.061, 'grad_norm': 211.55030822753906, 'learning_rate': 1.941016333938294e-05, 'epoch': 7.06} +{'loss': 39.8707, 'grad_norm': 199.51455688476562, 'learning_rate': 1.9404718693284937e-05, 'epoch': 7.06} +{'loss': 40.3183, 'grad_norm': 183.39486694335938, 'learning_rate': 1.9399274047186933e-05, 'epoch': 7.07} +{'loss': 40.8581, 'grad_norm': 238.36737060546875, 'learning_rate': 1.9393829401088928e-05, 'epoch': 7.07} +{'loss': 40.2192, 'grad_norm': 202.5072021484375, 'learning_rate': 1.9388384754990927e-05, 'epoch': 7.07} +{'loss': 40.8533, 'grad_norm': 204.236083984375, 'learning_rate': 1.9382940108892922e-05, 'epoch': 7.08} + 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 1970/5520 [1:42:26<2:51:15, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6252757906913757, 'eval_runtime': 3.1416, 'eval_samples_per_second': 56.978, 'eval_steps_per_second': 56.978, 'epoch': 7.08} +{'loss': 39.7229, 'grad_norm': 260.2081298828125, 'learning_rate': 1.937749546279492e-05, 'epoch': 7.08} +{'loss': 41.547, 'grad_norm': 241.91722106933594, 'learning_rate': 1.9372050816696917e-05, 'epoch': 7.08} +{'loss': 41.4826, 'grad_norm': 168.9304656982422, 'learning_rate': 1.9366606170598912e-05, 'epoch': 7.09} +{'loss': 41.5411, 'grad_norm': 230.05349731445312, 'learning_rate': 1.9361161524500907e-05, 'epoch': 7.09} +{'loss': 42.2347, 'grad_norm': 172.16851806640625, 'learning_rate': 1.9355716878402903e-05, 'epoch': 7.09} +{'loss': 41.4039, 'grad_norm': 312.65838623046875, 'learning_rate': 1.9350272232304898e-05, 'epoch': 7.1} +{'loss': 41.4234, 'grad_norm': 249.62351989746094, 'learning_rate': 1.9344827586206897e-05, 'epoch': 7.1} +{'loss': 38.0539, 'grad_norm': 250.49143981933594, 'learning_rate': 1.9339382940108896e-05, 'epoch': 7.1} +{'loss': 35.5584, 'grad_norm': 238.41546630859375, 'learning_rate': 1.933393829401089e-05, 'epoch': 7.11} +{'loss': 34.4491, 'grad_norm': 200.78282165527344, 'learning_rate': 1.9328493647912886e-05, 'epoch': 7.11} + 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 1980/5520 [1:42:57<2:49:54, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6286216378211975, 'eval_runtime': 3.1414, 'eval_samples_per_second': 56.982, 'eval_steps_per_second': 56.982, 'epoch': 7.11} +{'loss': 34.5403, 'grad_norm': 244.61717224121094, 'learning_rate': 1.9323049001814882e-05, 'epoch': 7.12} +{'loss': 35.7815, 'grad_norm': 219.14312744140625, 'learning_rate': 1.931760435571688e-05, 'epoch': 7.12} +{'loss': 35.638, 'grad_norm': 221.85130310058594, 'learning_rate': 1.9312159709618876e-05, 'epoch': 7.12} +{'loss': 35.1348, 'grad_norm': 237.97921752929688, 'learning_rate': 1.930671506352087e-05, 'epoch': 7.13} +{'loss': 35.8709, 'grad_norm': 234.06256103515625, 'learning_rate': 1.9301270417422867e-05, 'epoch': 7.13} +{'loss': 36.6859, 'grad_norm': 231.6852264404297, 'learning_rate': 1.9295825771324862e-05, 'epoch': 7.13} +{'loss': 37.24, 'grad_norm': 208.2762908935547, 'learning_rate': 1.9290381125226857e-05, 'epoch': 7.14} +{'loss': 36.4058, 'grad_norm': 219.8532257080078, 'learning_rate': 1.928493647912886e-05, 'epoch': 7.14} +{'loss': 36.7565, 'grad_norm': 242.73159790039062, 'learning_rate': 1.9279491833030855e-05, 'epoch': 7.14} +{'loss': 37.6752, 'grad_norm': 227.09645080566406, 'learning_rate': 1.927404718693285e-05, 'epoch': 7.15} + 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 1990/5520 [1:43:29<2:49:26, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6243596076965332, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.962, 'eval_steps_per_second': 56.962, 'epoch': 7.15} +{'loss': 38.3857, 'grad_norm': 236.27169799804688, 'learning_rate': 1.9268602540834846e-05, 'epoch': 7.15} +{'loss': 38.414, 'grad_norm': 244.84912109375, 'learning_rate': 1.926315789473684e-05, 'epoch': 7.16} +{'loss': 38.938, 'grad_norm': 203.36798095703125, 'learning_rate': 1.925771324863884e-05, 'epoch': 7.16} +{'loss': 37.654, 'grad_norm': 225.50152587890625, 'learning_rate': 1.9252268602540835e-05, 'epoch': 7.16} +{'loss': 28.2794, 'grad_norm': 236.4989471435547, 'learning_rate': 1.924682395644283e-05, 'epoch': 7.17} +{'loss': 23.3804, 'grad_norm': 173.909423828125, 'learning_rate': 1.9241379310344826e-05, 'epoch': 7.17} +{'loss': 24.4696, 'grad_norm': 195.63526916503906, 'learning_rate': 1.9235934664246825e-05, 'epoch': 7.17} +{'loss': 23.9438, 'grad_norm': 150.0059356689453, 'learning_rate': 1.923049001814882e-05, 'epoch': 7.18} +{'loss': 25.4084, 'grad_norm': 217.61630249023438, 'learning_rate': 1.922504537205082e-05, 'epoch': 7.18} +{'loss': 44.7159, 'grad_norm': 259.2041015625, 'learning_rate': 1.9219600725952814e-05, 'epoch': 7.18} + 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2000/5520 [1:44:01<2:49:36, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6465168595314026, 'eval_runtime': 3.1409, 'eval_samples_per_second': 56.991, 'eval_steps_per_second': 56.991, 'epoch': 7.18} +{'loss': 45.7571, 'grad_norm': 282.1758117675781, 'learning_rate': 1.921415607985481e-05, 'epoch': 7.19} +{'loss': 44.7227, 'grad_norm': 276.5455322265625, 'learning_rate': 1.9208711433756805e-05, 'epoch': 7.19} +{'loss': 43.0705, 'grad_norm': 251.93589782714844, 'learning_rate': 1.92032667876588e-05, 'epoch': 7.2} +{'loss': 43.2009, 'grad_norm': 224.8245086669922, 'learning_rate': 1.91978221415608e-05, 'epoch': 7.2} +{'loss': 43.4496, 'grad_norm': 233.61770629882812, 'learning_rate': 1.9192377495462795e-05, 'epoch': 7.2} +{'loss': 42.5907, 'grad_norm': 188.65252685546875, 'learning_rate': 1.9186932849364793e-05, 'epoch': 7.21} +{'loss': 44.4651, 'grad_norm': 185.1155242919922, 'learning_rate': 1.918148820326679e-05, 'epoch': 7.21} +{'loss': 43.6325, 'grad_norm': 169.09701538085938, 'learning_rate': 1.9176043557168784e-05, 'epoch': 7.21} +{'loss': 43.5817, 'grad_norm': 198.49114990234375, 'learning_rate': 1.9170598911070783e-05, 'epoch': 7.22} +{'loss': 41.4884, 'grad_norm': 193.17591857910156, 'learning_rate': 1.916515426497278e-05, 'epoch': 7.22} + 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2000/5520 [1:44:04<2:49:36, 2.89s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 36%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2010/5520 [1:44:33<2:49:58, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6329721212387085, 'eval_runtime': 3.1406, 'eval_samples_per_second': 56.995, 'eval_steps_per_second': 56.995, 'epoch': 7.22} +{'loss': 41.2168, 'grad_norm': 202.32730102539062, 'learning_rate': 1.9159709618874774e-05, 'epoch': 7.22} +{'loss': 39.9909, 'grad_norm': 206.4916534423828, 'learning_rate': 1.915426497277677e-05, 'epoch': 7.23} +{'loss': 40.1413, 'grad_norm': 202.2099609375, 'learning_rate': 1.9148820326678765e-05, 'epoch': 7.23} +{'loss': 39.5872, 'grad_norm': 223.7954559326172, 'learning_rate': 1.914337568058076e-05, 'epoch': 7.23} +{'loss': 41.3396, 'grad_norm': 225.8967742919922, 'learning_rate': 1.9137931034482762e-05, 'epoch': 7.24} +{'loss': 39.012, 'grad_norm': 248.0997772216797, 'learning_rate': 1.9132486388384757e-05, 'epoch': 7.24} +{'loss': 42.5922, 'grad_norm': 227.4576873779297, 'learning_rate': 1.9127041742286753e-05, 'epoch': 7.25} +{'loss': 41.6107, 'grad_norm': 197.62547302246094, 'learning_rate': 1.9121597096188748e-05, 'epoch': 7.25} +{'loss': 40.3326, 'grad_norm': 170.18817138671875, 'learning_rate': 1.9116152450090744e-05, 'epoch': 7.25} +{'loss': 41.0365, 'grad_norm': 186.9420166015625, 'learning_rate': 1.9110707803992742e-05, 'epoch': 7.26} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2020/5520 [1:45:05<2:48:13, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6230406761169434, 'eval_runtime': 3.141, 'eval_samples_per_second': 56.988, 'eval_steps_per_second': 56.988, 'epoch': 7.26} +{'loss': 42.0278, 'grad_norm': 188.11244201660156, 'learning_rate': 1.9105263157894738e-05, 'epoch': 7.26} +{'loss': 41.5539, 'grad_norm': 242.47305297851562, 'learning_rate': 1.9099818511796733e-05, 'epoch': 7.26} +{'loss': 41.8641, 'grad_norm': 190.83987426757812, 'learning_rate': 1.909437386569873e-05, 'epoch': 7.27} +{'loss': 42.232, 'grad_norm': 214.44650268554688, 'learning_rate': 1.9088929219600724e-05, 'epoch': 7.27} +{'loss': 41.6186, 'grad_norm': 216.3888397216797, 'learning_rate': 1.9083484573502723e-05, 'epoch': 7.27} +{'loss': 42.2099, 'grad_norm': 210.46673583984375, 'learning_rate': 1.907803992740472e-05, 'epoch': 7.28} +{'loss': 42.78, 'grad_norm': 194.84165954589844, 'learning_rate': 1.9072595281306717e-05, 'epoch': 7.28} +{'loss': 38.7115, 'grad_norm': 201.91297912597656, 'learning_rate': 1.9067150635208712e-05, 'epoch': 7.29} +{'loss': 35.7841, 'grad_norm': 245.42625427246094, 'learning_rate': 1.9061705989110708e-05, 'epoch': 7.29} +{'loss': 34.3308, 'grad_norm': 182.4967041015625, 'learning_rate': 1.9056261343012703e-05, 'epoch': 7.29} + 37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2030/5520 [1:45:37<2:48:46, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6238341331481934, 'eval_runtime': 3.1431, 'eval_samples_per_second': 56.95, 'eval_steps_per_second': 56.95, 'epoch': 7.29} +{'loss': 34.7534, 'grad_norm': 297.3916320800781, 'learning_rate': 1.9050816696914702e-05, 'epoch': 7.3} +{'loss': 34.0303, 'grad_norm': 211.52554321289062, 'learning_rate': 1.9045372050816697e-05, 'epoch': 7.3} +{'loss': 35.7378, 'grad_norm': 232.99844360351562, 'learning_rate': 1.9039927404718693e-05, 'epoch': 7.3} +{'loss': 36.7492, 'grad_norm': 230.34642028808594, 'learning_rate': 1.903448275862069e-05, 'epoch': 7.31} +{'loss': 35.1188, 'grad_norm': 228.88966369628906, 'learning_rate': 1.9029038112522687e-05, 'epoch': 7.31} +{'loss': 35.0688, 'grad_norm': 213.2604522705078, 'learning_rate': 1.9023593466424682e-05, 'epoch': 7.31} +{'loss': 37.6721, 'grad_norm': 202.62200927734375, 'learning_rate': 1.901814882032668e-05, 'epoch': 7.32} +{'loss': 36.7728, 'grad_norm': 191.8877410888672, 'learning_rate': 1.9012704174228676e-05, 'epoch': 7.32} +{'loss': 36.6342, 'grad_norm': 211.57571411132812, 'learning_rate': 1.900725952813067e-05, 'epoch': 7.33} +{'loss': 36.8319, 'grad_norm': 177.2289581298828, 'learning_rate': 1.9001814882032667e-05, 'epoch': 7.33} + 37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2040/5520 [1:46:08<2:47:31, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6231008172035217, 'eval_runtime': 3.141, 'eval_samples_per_second': 56.988, 'eval_steps_per_second': 56.988, 'epoch': 7.33} +{'loss': 36.6706, 'grad_norm': 227.7028350830078, 'learning_rate': 1.8996370235934662e-05, 'epoch': 7.33} +{'loss': 37.0749, 'grad_norm': 229.02972412109375, 'learning_rate': 1.899092558983666e-05, 'epoch': 7.34} +{'loss': 37.3716, 'grad_norm': 234.30946350097656, 'learning_rate': 1.898548094373866e-05, 'epoch': 7.34} +{'loss': 38.9503, 'grad_norm': 236.79893493652344, 'learning_rate': 1.8980036297640655e-05, 'epoch': 7.34} +{'loss': 32.5056, 'grad_norm': 256.5646057128906, 'learning_rate': 1.897459165154265e-05, 'epoch': 7.35} +{'loss': 25.3982, 'grad_norm': 183.38961791992188, 'learning_rate': 1.8969147005444646e-05, 'epoch': 7.35} +{'loss': 23.2743, 'grad_norm': 214.09742736816406, 'learning_rate': 1.896370235934664e-05, 'epoch': 7.35} +{'loss': 24.8062, 'grad_norm': 190.10867309570312, 'learning_rate': 1.895825771324864e-05, 'epoch': 7.36} +{'loss': 25.5098, 'grad_norm': 197.85313415527344, 'learning_rate': 1.8952813067150636e-05, 'epoch': 7.36} +{'loss': 44.3536, 'grad_norm': 235.79090881347656, 'learning_rate': 1.894736842105263e-05, 'epoch': 7.36} + 37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2050/5520 [1:46:40<2:46:06, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6341925263404846, 'eval_runtime': 3.1523, 'eval_samples_per_second': 56.785, 'eval_steps_per_second': 56.785, 'epoch': 7.36} +{'loss': 44.6073, 'grad_norm': 232.7415771484375, 'learning_rate': 1.8941923774954626e-05, 'epoch': 7.37} +{'loss': 43.8575, 'grad_norm': 302.3766174316406, 'learning_rate': 1.8936479128856625e-05, 'epoch': 7.37} +{'loss': 42.4378, 'grad_norm': 208.41441345214844, 'learning_rate': 1.8931034482758624e-05, 'epoch': 7.38} +{'loss': 44.5641, 'grad_norm': 228.000732421875, 'learning_rate': 1.892558983666062e-05, 'epoch': 7.38} +{'loss': 43.7578, 'grad_norm': 201.757080078125, 'learning_rate': 1.8920145190562615e-05, 'epoch': 7.38} +{'loss': 42.755, 'grad_norm': 220.2481689453125, 'learning_rate': 1.891470054446461e-05, 'epoch': 7.39} +{'loss': 44.3785, 'grad_norm': 225.5443115234375, 'learning_rate': 1.8909255898366605e-05, 'epoch': 7.39} +{'loss': 42.994, 'grad_norm': 200.2024688720703, 'learning_rate': 1.89038112522686e-05, 'epoch': 7.39} +{'loss': 43.1902, 'grad_norm': 205.64794921875, 'learning_rate': 1.88983666061706e-05, 'epoch': 7.4} +{'loss': 40.9422, 'grad_norm': 183.3535919189453, 'learning_rate': 1.8892921960072595e-05, 'epoch': 7.4} + 37%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2060/5520 [1:47:12<2:46:12, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.626913845539093, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.963, 'eval_steps_per_second': 56.963, 'epoch': 7.4} +{'loss': 39.4408, 'grad_norm': 201.8138885498047, 'learning_rate': 1.8887477313974594e-05, 'epoch': 7.4} +{'loss': 39.5467, 'grad_norm': 201.8863525390625, 'learning_rate': 1.888203266787659e-05, 'epoch': 7.41} +{'loss': 41.2256, 'grad_norm': 239.10687255859375, 'learning_rate': 1.8876588021778585e-05, 'epoch': 7.41} +{'loss': 40.8963, 'grad_norm': 209.47796630859375, 'learning_rate': 1.8871143375680583e-05, 'epoch': 7.42} +{'loss': 40.5138, 'grad_norm': 202.6414794921875, 'learning_rate': 1.886569872958258e-05, 'epoch': 7.42} +{'loss': 39.1767, 'grad_norm': 198.01795959472656, 'learning_rate': 1.8860254083484574e-05, 'epoch': 7.42} +{'loss': 40.6713, 'grad_norm': 173.26507568359375, 'learning_rate': 1.885480943738657e-05, 'epoch': 7.43} +{'loss': 41.2602, 'grad_norm': 166.11607360839844, 'learning_rate': 1.8849364791288565e-05, 'epoch': 7.43} +{'loss': 41.0714, 'grad_norm': 200.76956176757812, 'learning_rate': 1.884392014519056e-05, 'epoch': 7.43} +{'loss': 39.6812, 'grad_norm': 213.75315856933594, 'learning_rate': 1.883847549909256e-05, 'epoch': 7.44} + 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2070/5520 [1:47:43<2:46:17, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6279598474502563, 'eval_runtime': 3.1403, 'eval_samples_per_second': 57.0, 'eval_steps_per_second': 57.0, 'epoch': 7.44} +{'loss': 41.6964, 'grad_norm': 221.25025939941406, 'learning_rate': 1.8833030852994558e-05, 'epoch': 7.44} +{'loss': 41.4608, 'grad_norm': 171.32106018066406, 'learning_rate': 1.8827586206896553e-05, 'epoch': 7.44} +{'loss': 41.2687, 'grad_norm': 222.76600646972656, 'learning_rate': 1.882214156079855e-05, 'epoch': 7.45} +{'loss': 41.6048, 'grad_norm': 169.82395935058594, 'learning_rate': 1.8816696914700544e-05, 'epoch': 7.45} +{'loss': 41.8843, 'grad_norm': 190.5113525390625, 'learning_rate': 1.8811252268602543e-05, 'epoch': 7.46} +{'loss': 43.5968, 'grad_norm': 194.5990447998047, 'learning_rate': 1.8805807622504538e-05, 'epoch': 7.46} +{'loss': 41.6743, 'grad_norm': 216.0985870361328, 'learning_rate': 1.8800362976406533e-05, 'epoch': 7.46} +{'loss': 39.4203, 'grad_norm': 249.05270385742188, 'learning_rate': 1.879491833030853e-05, 'epoch': 7.47} +{'loss': 36.2202, 'grad_norm': 232.5495147705078, 'learning_rate': 1.8789473684210524e-05, 'epoch': 7.47} +{'loss': 34.9116, 'grad_norm': 218.72299194335938, 'learning_rate': 1.8784029038112523e-05, 'epoch': 7.47} + 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2080/5520 [1:48:15<2:45:05, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6241349577903748, 'eval_runtime': 3.1499, 'eval_samples_per_second': 56.827, 'eval_steps_per_second': 56.827, 'epoch': 7.47} +{'loss': 36.2476, 'grad_norm': 241.78179931640625, 'learning_rate': 1.8778584392014522e-05, 'epoch': 7.48} +{'loss': 34.4524, 'grad_norm': 194.92982482910156, 'learning_rate': 1.8773139745916517e-05, 'epoch': 7.48} +{'loss': 34.5292, 'grad_norm': 227.76156616210938, 'learning_rate': 1.8767695099818513e-05, 'epoch': 7.48} +{'loss': 37.8068, 'grad_norm': 287.61309814453125, 'learning_rate': 1.8762250453720508e-05, 'epoch': 7.49} +{'loss': 36.0941, 'grad_norm': 191.0822296142578, 'learning_rate': 1.8756805807622503e-05, 'epoch': 7.49} +{'loss': 36.3624, 'grad_norm': 197.5564422607422, 'learning_rate': 1.8751361161524502e-05, 'epoch': 7.49} +{'loss': 37.5074, 'grad_norm': 187.72479248046875, 'learning_rate': 1.8745916515426497e-05, 'epoch': 7.5} +{'loss': 35.6139, 'grad_norm': 220.4607391357422, 'learning_rate': 1.8740471869328493e-05, 'epoch': 7.5} +{'loss': 37.7286, 'grad_norm': 179.05612182617188, 'learning_rate': 1.873502722323049e-05, 'epoch': 7.51} +{'loss': 36.1803, 'grad_norm': 230.91879272460938, 'learning_rate': 1.8729582577132487e-05, 'epoch': 7.51} + 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2090/5520 [1:48:47<2:46:13, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6255043148994446, 'eval_runtime': 3.1466, 'eval_samples_per_second': 56.887, 'eval_steps_per_second': 56.887, 'epoch': 7.51} +{'loss': 36.5782, 'grad_norm': 182.89437866210938, 'learning_rate': 1.8724137931034482e-05, 'epoch': 7.51} +{'loss': 38.233, 'grad_norm': 215.36769104003906, 'learning_rate': 1.871869328493648e-05, 'epoch': 7.52} +{'loss': 38.6268, 'grad_norm': 232.6095733642578, 'learning_rate': 1.8713248638838477e-05, 'epoch': 7.52} +{'loss': 38.1768, 'grad_norm': 236.94281005859375, 'learning_rate': 1.8707803992740472e-05, 'epoch': 7.52} +{'loss': 27.514, 'grad_norm': 214.16079711914062, 'learning_rate': 1.8702359346642467e-05, 'epoch': 7.53} +{'loss': 24.274, 'grad_norm': 192.6107940673828, 'learning_rate': 1.8696914700544463e-05, 'epoch': 7.53} +{'loss': 23.2824, 'grad_norm': 217.98619079589844, 'learning_rate': 1.869147005444646e-05, 'epoch': 7.53} +{'loss': 24.9622, 'grad_norm': 183.04296875, 'learning_rate': 1.868602540834846e-05, 'epoch': 7.54} +{'loss': 25.1446, 'grad_norm': 167.1417236328125, 'learning_rate': 1.8680580762250456e-05, 'epoch': 7.54} +{'loss': 44.1171, 'grad_norm': 287.29937744140625, 'learning_rate': 1.867513611615245e-05, 'epoch': 7.55} + 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2100/5520 [1:49:18<2:45:20, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6376849412918091, 'eval_runtime': 3.1443, 'eval_samples_per_second': 56.929, 'eval_steps_per_second': 56.929, 'epoch': 7.55} +{'loss': 46.3716, 'grad_norm': 285.3408203125, 'learning_rate': 1.8669691470054446e-05, 'epoch': 7.55} +{'loss': 44.0514, 'grad_norm': 233.18389892578125, 'learning_rate': 1.8664246823956445e-05, 'epoch': 7.55} +{'loss': 44.1784, 'grad_norm': 256.4196472167969, 'learning_rate': 1.865880217785844e-05, 'epoch': 7.56} +{'loss': 42.9897, 'grad_norm': 223.28128051757812, 'learning_rate': 1.8653357531760436e-05, 'epoch': 7.56} +{'loss': 43.7651, 'grad_norm': 235.2901153564453, 'learning_rate': 1.864791288566243e-05, 'epoch': 7.56} +{'loss': 44.6333, 'grad_norm': 285.9206237792969, 'learning_rate': 1.8642468239564427e-05, 'epoch': 7.57} +{'loss': 43.9845, 'grad_norm': 200.00210571289062, 'learning_rate': 1.8637023593466425e-05, 'epoch': 7.57} +{'loss': 44.7301, 'grad_norm': 277.73394775390625, 'learning_rate': 1.8631578947368424e-05, 'epoch': 7.57} +{'loss': 44.0409, 'grad_norm': 216.9422149658203, 'learning_rate': 1.862613430127042e-05, 'epoch': 7.58} +{'loss': 43.4026, 'grad_norm': 198.86639404296875, 'learning_rate': 1.8620689655172415e-05, 'epoch': 7.58} + 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2110/5520 [1:49:50<2:44:24, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6270378232002258, 'eval_runtime': 3.1464, 'eval_samples_per_second': 56.891, 'eval_steps_per_second': 56.891, 'epoch': 7.58} +{'loss': 41.4092, 'grad_norm': 240.495361328125, 'learning_rate': 1.861524500907441e-05, 'epoch': 7.59} +{'loss': 40.1396, 'grad_norm': 240.1851043701172, 'learning_rate': 1.8609800362976406e-05, 'epoch': 7.59} +{'loss': 39.1778, 'grad_norm': 241.21495056152344, 'learning_rate': 1.8604355716878405e-05, 'epoch': 7.59} +{'loss': 41.0348, 'grad_norm': 287.3133544921875, 'learning_rate': 1.85989110707804e-05, 'epoch': 7.6} +{'loss': 39.5872, 'grad_norm': 230.4313201904297, 'learning_rate': 1.8593466424682395e-05, 'epoch': 7.6} +{'loss': 40.6146, 'grad_norm': 210.32962036132812, 'learning_rate': 1.858802177858439e-05, 'epoch': 7.6} +{'loss': 39.6363, 'grad_norm': 185.81752014160156, 'learning_rate': 1.858257713248639e-05, 'epoch': 7.61} +{'loss': 40.558, 'grad_norm': 234.63037109375, 'learning_rate': 1.8577132486388385e-05, 'epoch': 7.61} +{'loss': 41.1624, 'grad_norm': 289.92803955078125, 'learning_rate': 1.8571687840290384e-05, 'epoch': 7.61} +{'loss': 41.7827, 'grad_norm': 252.82188415527344, 'learning_rate': 1.856624319419238e-05, 'epoch': 7.62} + 38%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2120/5520 [1:50:22<2:43:33, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6290409564971924, 'eval_runtime': 3.1469, 'eval_samples_per_second': 56.882, 'eval_steps_per_second': 56.882, 'epoch': 7.62} +{'loss': 39.0072, 'grad_norm': 201.8303985595703, 'learning_rate': 1.8560798548094374e-05, 'epoch': 7.62} +{'loss': 39.9822, 'grad_norm': 158.71446228027344, 'learning_rate': 1.855535390199637e-05, 'epoch': 7.62} +{'loss': 42.1973, 'grad_norm': 171.3879852294922, 'learning_rate': 1.8549909255898365e-05, 'epoch': 7.63} +{'loss': 42.933, 'grad_norm': 218.584228515625, 'learning_rate': 1.8544464609800364e-05, 'epoch': 7.63} +{'loss': 41.9847, 'grad_norm': 200.60093688964844, 'learning_rate': 1.853901996370236e-05, 'epoch': 7.64} +{'loss': 42.4961, 'grad_norm': 210.75128173828125, 'learning_rate': 1.8533575317604358e-05, 'epoch': 7.64} +{'loss': 39.3404, 'grad_norm': 187.47406005859375, 'learning_rate': 1.8528130671506353e-05, 'epoch': 7.64} +{'loss': 40.3011, 'grad_norm': 204.87693786621094, 'learning_rate': 1.852268602540835e-05, 'epoch': 7.65} +{'loss': 37.4416, 'grad_norm': 228.8159637451172, 'learning_rate': 1.8517241379310344e-05, 'epoch': 7.65} +{'loss': 35.3079, 'grad_norm': 237.59664916992188, 'learning_rate': 1.8511796733212343e-05, 'epoch': 7.65} + 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2130/5520 [1:50:53<2:43:48, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6256567239761353, 'eval_runtime': 3.1458, 'eval_samples_per_second': 56.902, 'eval_steps_per_second': 56.902, 'epoch': 7.65} +{'loss': 34.5055, 'grad_norm': 233.3187713623047, 'learning_rate': 1.850635208711434e-05, 'epoch': 7.66} +{'loss': 34.1232, 'grad_norm': 232.7037353515625, 'learning_rate': 1.8500907441016334e-05, 'epoch': 7.66} +{'loss': 35.3301, 'grad_norm': 254.53050231933594, 'learning_rate': 1.849546279491833e-05, 'epoch': 7.66} +{'loss': 35.9202, 'grad_norm': 234.93154907226562, 'learning_rate': 1.8490018148820324e-05, 'epoch': 7.67} +{'loss': 36.5702, 'grad_norm': 237.99671936035156, 'learning_rate': 1.8484573502722327e-05, 'epoch': 7.67} +{'loss': 35.9423, 'grad_norm': 186.25271606445312, 'learning_rate': 1.8479128856624322e-05, 'epoch': 7.68} +{'loss': 37.4121, 'grad_norm': 226.461669921875, 'learning_rate': 1.8473684210526317e-05, 'epoch': 7.68} +{'loss': 36.8802, 'grad_norm': 227.0966033935547, 'learning_rate': 1.8468239564428313e-05, 'epoch': 7.68} +{'loss': 36.0245, 'grad_norm': 193.4064178466797, 'learning_rate': 1.8462794918330308e-05, 'epoch': 7.69} +{'loss': 37.4833, 'grad_norm': 279.1668395996094, 'learning_rate': 1.8457350272232304e-05, 'epoch': 7.69} + 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2140/5520 [1:51:25<2:43:16, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6227458715438843, 'eval_runtime': 3.1429, 'eval_samples_per_second': 56.953, 'eval_steps_per_second': 56.953, 'epoch': 7.69} +{'loss': 36.8538, 'grad_norm': 254.59234619140625, 'learning_rate': 1.8451905626134302e-05, 'epoch': 7.69} +{'loss': 37.8517, 'grad_norm': 191.14463806152344, 'learning_rate': 1.8446460980036298e-05, 'epoch': 7.7} +{'loss': 38.406, 'grad_norm': 189.20896911621094, 'learning_rate': 1.8441016333938293e-05, 'epoch': 7.7} +{'loss': 37.7692, 'grad_norm': 209.61175537109375, 'learning_rate': 1.8435571687840292e-05, 'epoch': 7.7} +{'loss': 36.087, 'grad_norm': 220.5150146484375, 'learning_rate': 1.8430127041742287e-05, 'epoch': 7.71} +{'loss': 25.6052, 'grad_norm': 211.78372192382812, 'learning_rate': 1.8424682395644286e-05, 'epoch': 7.71} +{'loss': 23.5576, 'grad_norm': 223.85789489746094, 'learning_rate': 1.841923774954628e-05, 'epoch': 7.72} +{'loss': 24.4869, 'grad_norm': 163.74220275878906, 'learning_rate': 1.8413793103448277e-05, 'epoch': 7.72} +{'loss': 25.1878, 'grad_norm': 182.80079650878906, 'learning_rate': 1.8408348457350272e-05, 'epoch': 7.72} +{'loss': 44.4643, 'grad_norm': 296.0340270996094, 'learning_rate': 1.8402903811252268e-05, 'epoch': 7.73} + 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2150/5520 [1:51:57<2:42:39, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6382863521575928, 'eval_runtime': 3.1441, 'eval_samples_per_second': 56.932, 'eval_steps_per_second': 56.932, 'epoch': 7.73} +{'loss': 45.2141, 'grad_norm': 248.48643493652344, 'learning_rate': 1.8397459165154263e-05, 'epoch': 7.73} +{'loss': 42.9435, 'grad_norm': 240.9061279296875, 'learning_rate': 1.8392014519056262e-05, 'epoch': 7.73} +{'loss': 42.9769, 'grad_norm': 231.62315368652344, 'learning_rate': 1.8386569872958257e-05, 'epoch': 7.74} +{'loss': 43.6058, 'grad_norm': 244.36915588378906, 'learning_rate': 1.8381125226860256e-05, 'epoch': 7.74} +{'loss': 43.1753, 'grad_norm': 252.9080047607422, 'learning_rate': 1.837568058076225e-05, 'epoch': 7.74} +{'loss': 43.3285, 'grad_norm': 274.0201721191406, 'learning_rate': 1.8370235934664247e-05, 'epoch': 7.75} +{'loss': 43.3158, 'grad_norm': 226.75595092773438, 'learning_rate': 1.8364791288566245e-05, 'epoch': 7.75} +{'loss': 43.5773, 'grad_norm': 197.0859832763672, 'learning_rate': 1.835934664246824e-05, 'epoch': 7.75} +{'loss': 43.9208, 'grad_norm': 212.14720153808594, 'learning_rate': 1.8353901996370236e-05, 'epoch': 7.76} +{'loss': 42.8429, 'grad_norm': 230.22158813476562, 'learning_rate': 1.834845735027223e-05, 'epoch': 7.76} + 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2160/5520 [1:52:28<2:41:21, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6291994452476501, 'eval_runtime': 3.1473, 'eval_samples_per_second': 56.874, 'eval_steps_per_second': 56.874, 'epoch': 7.76} +{'loss': 40.7289, 'grad_norm': 215.79391479492188, 'learning_rate': 1.8343012704174227e-05, 'epoch': 7.77} +{'loss': 39.9759, 'grad_norm': 210.00296020507812, 'learning_rate': 1.8337568058076222e-05, 'epoch': 7.77} +{'loss': 40.551, 'grad_norm': 291.2987976074219, 'learning_rate': 1.8332123411978224e-05, 'epoch': 7.77} +{'loss': 40.7981, 'grad_norm': 218.08819580078125, 'learning_rate': 1.832667876588022e-05, 'epoch': 7.78} +{'loss': 40.5463, 'grad_norm': 268.615966796875, 'learning_rate': 1.8321234119782215e-05, 'epoch': 7.78} +{'loss': 40.6168, 'grad_norm': 269.939697265625, 'learning_rate': 1.831578947368421e-05, 'epoch': 7.78} +{'loss': 41.2449, 'grad_norm': 268.9761657714844, 'learning_rate': 1.8310344827586206e-05, 'epoch': 7.79} +{'loss': 40.6308, 'grad_norm': 161.08811950683594, 'learning_rate': 1.8304900181488205e-05, 'epoch': 7.79} +{'loss': 40.9708, 'grad_norm': 190.44696044921875, 'learning_rate': 1.82994555353902e-05, 'epoch': 7.79} +{'loss': 41.2053, 'grad_norm': 202.4305419921875, 'learning_rate': 1.8294010889292196e-05, 'epoch': 7.8} + 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2170/5520 [1:53:00<2:40:33, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6233534812927246, 'eval_runtime': 3.1457, 'eval_samples_per_second': 56.903, 'eval_steps_per_second': 56.903, 'epoch': 7.8} +{'loss': 40.3928, 'grad_norm': 188.5523681640625, 'learning_rate': 1.828856624319419e-05, 'epoch': 7.8} +{'loss': 42.3466, 'grad_norm': 184.18296813964844, 'learning_rate': 1.828312159709619e-05, 'epoch': 7.81} +{'loss': 42.0301, 'grad_norm': 223.9243927001953, 'learning_rate': 1.8277676950998185e-05, 'epoch': 7.81} +{'loss': 42.3284, 'grad_norm': 202.3498077392578, 'learning_rate': 1.8272232304900184e-05, 'epoch': 7.81} +{'loss': 42.0951, 'grad_norm': 205.77940368652344, 'learning_rate': 1.826678765880218e-05, 'epoch': 7.82} +{'loss': 40.826, 'grad_norm': 191.46728515625, 'learning_rate': 1.8261343012704175e-05, 'epoch': 7.82} +{'loss': 42.7909, 'grad_norm': 276.8330383300781, 'learning_rate': 1.825589836660617e-05, 'epoch': 7.82} +{'loss': 38.6068, 'grad_norm': 181.93955993652344, 'learning_rate': 1.8250453720508165e-05, 'epoch': 7.83} +{'loss': 35.694, 'grad_norm': 178.79856872558594, 'learning_rate': 1.8245009074410164e-05, 'epoch': 7.83} +{'loss': 36.7127, 'grad_norm': 224.6522979736328, 'learning_rate': 1.823956442831216e-05, 'epoch': 7.83} + 39%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2180/5520 [1:53:32<2:40:23, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6237645745277405, 'eval_runtime': 3.1433, 'eval_samples_per_second': 56.947, 'eval_steps_per_second': 56.947, 'epoch': 7.83} +{'loss': 34.0039, 'grad_norm': 203.37196350097656, 'learning_rate': 1.823411978221416e-05, 'epoch': 7.84} +{'loss': 33.2787, 'grad_norm': 212.79307556152344, 'learning_rate': 1.8228675136116154e-05, 'epoch': 7.84} +{'loss': 35.4241, 'grad_norm': 215.5691375732422, 'learning_rate': 1.822323049001815e-05, 'epoch': 7.85} +{'loss': 36.9333, 'grad_norm': 230.0751190185547, 'learning_rate': 1.8217785843920144e-05, 'epoch': 7.85} +{'loss': 35.7233, 'grad_norm': 217.8132781982422, 'learning_rate': 1.8212341197822143e-05, 'epoch': 7.85} +{'loss': 36.6111, 'grad_norm': 245.93177795410156, 'learning_rate': 1.820689655172414e-05, 'epoch': 7.86} +{'loss': 36.3243, 'grad_norm': 210.58218383789062, 'learning_rate': 1.8201451905626134e-05, 'epoch': 7.86} +{'loss': 37.0315, 'grad_norm': 234.6280059814453, 'learning_rate': 1.819600725952813e-05, 'epoch': 7.86} +{'loss': 35.8725, 'grad_norm': 184.53121948242188, 'learning_rate': 1.8190562613430125e-05, 'epoch': 7.87} +{'loss': 37.9183, 'grad_norm': 201.5563507080078, 'learning_rate': 1.8185117967332127e-05, 'epoch': 7.87} + 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2190/5520 [1:54:03<2:41:17, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6210297346115112, 'eval_runtime': 3.142, 'eval_samples_per_second': 56.969, 'eval_steps_per_second': 56.969, 'epoch': 7.87} +{'loss': 37.1709, 'grad_norm': 192.29579162597656, 'learning_rate': 1.8179673321234122e-05, 'epoch': 7.87} +{'loss': 38.5338, 'grad_norm': 246.0638427734375, 'learning_rate': 1.8174228675136118e-05, 'epoch': 7.88} +{'loss': 37.7041, 'grad_norm': 237.47607421875, 'learning_rate': 1.8168784029038113e-05, 'epoch': 7.88} +{'loss': 38.1663, 'grad_norm': 215.06407165527344, 'learning_rate': 1.816333938294011e-05, 'epoch': 7.88} +{'loss': 32.1679, 'grad_norm': 193.76809692382812, 'learning_rate': 1.8157894736842107e-05, 'epoch': 7.89} +{'loss': 24.2413, 'grad_norm': 208.66111755371094, 'learning_rate': 1.8152450090744103e-05, 'epoch': 7.89} +{'loss': 24.1102, 'grad_norm': 182.810546875, 'learning_rate': 1.8147005444646098e-05, 'epoch': 7.9} +{'loss': 24.5778, 'grad_norm': 200.25823974609375, 'learning_rate': 1.8141560798548093e-05, 'epoch': 7.9} +{'loss': 26.1643, 'grad_norm': 224.19125366210938, 'learning_rate': 1.813611615245009e-05, 'epoch': 7.9} +{'loss': 45.1071, 'grad_norm': 261.03033447265625, 'learning_rate': 1.8130671506352088e-05, 'epoch': 7.91} + 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2200/5520 [1:54:35<2:40:09, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6303785443305969, 'eval_runtime': 3.1423, 'eval_samples_per_second': 56.964, 'eval_steps_per_second': 56.964, 'epoch': 7.91} +{'loss': 43.8271, 'grad_norm': 273.6593322753906, 'learning_rate': 1.8125226860254086e-05, 'epoch': 7.91} +{'loss': 43.7623, 'grad_norm': 304.0534362792969, 'learning_rate': 1.8119782214156082e-05, 'epoch': 7.91} +{'loss': 43.7191, 'grad_norm': 249.27255249023438, 'learning_rate': 1.8114337568058077e-05, 'epoch': 7.92} +{'loss': 44.1019, 'grad_norm': 199.5006103515625, 'learning_rate': 1.8108892921960072e-05, 'epoch': 7.92} +{'loss': 43.9717, 'grad_norm': 228.42832946777344, 'learning_rate': 1.8103448275862068e-05, 'epoch': 7.92} +{'loss': 40.022, 'grad_norm': 247.20901489257812, 'learning_rate': 1.8098003629764067e-05, 'epoch': 7.93} +{'loss': 40.6639, 'grad_norm': 297.5372619628906, 'learning_rate': 1.8092558983666062e-05, 'epoch': 7.93} +{'loss': 40.3569, 'grad_norm': 245.11915588378906, 'learning_rate': 1.8087114337568057e-05, 'epoch': 7.94} +{'loss': 41.7983, 'grad_norm': 255.53297424316406, 'learning_rate': 1.8081669691470056e-05, 'epoch': 7.94} +{'loss': 41.7844, 'grad_norm': 226.12783813476562, 'learning_rate': 1.807622504537205e-05, 'epoch': 7.94} + 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2200/5520 [1:54:38<2:40:09, 2.89s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2210/5520 [1:55:08<2:41:00, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6214397549629211, 'eval_runtime': 3.1428, 'eval_samples_per_second': 56.956, 'eval_steps_per_second': 56.956, 'epoch': 7.94} +{'loss': 42.057, 'grad_norm': 220.90577697753906, 'learning_rate': 1.8070780399274047e-05, 'epoch': 7.95} +{'loss': 42.0299, 'grad_norm': 192.33856201171875, 'learning_rate': 1.8065335753176046e-05, 'epoch': 7.95} +{'loss': 41.7752, 'grad_norm': 192.8511962890625, 'learning_rate': 1.805989110707804e-05, 'epoch': 7.95} +{'loss': 41.0178, 'grad_norm': 223.10275268554688, 'learning_rate': 1.8054446460980036e-05, 'epoch': 7.96} +{'loss': 37.9747, 'grad_norm': 189.8402099609375, 'learning_rate': 1.8049001814882032e-05, 'epoch': 7.96} +{'loss': 35.3994, 'grad_norm': 233.5938720703125, 'learning_rate': 1.8043557168784027e-05, 'epoch': 7.96} +{'loss': 35.1967, 'grad_norm': 218.5577850341797, 'learning_rate': 1.8038112522686026e-05, 'epoch': 7.97} +{'loss': 34.5792, 'grad_norm': 228.49502563476562, 'learning_rate': 1.8032667876588025e-05, 'epoch': 7.97} +{'loss': 37.9449, 'grad_norm': 285.4461364746094, 'learning_rate': 1.802722323049002e-05, 'epoch': 7.98} +{'loss': 36.3295, 'grad_norm': 186.83755493164062, 'learning_rate': 1.8021778584392016e-05, 'epoch': 7.98} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2220/5520 [1:55:39<2:38:02, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6212169528007507, 'eval_runtime': 3.1428, 'eval_samples_per_second': 56.956, 'eval_steps_per_second': 56.956, 'epoch': 7.98} +{'loss': 37.0061, 'grad_norm': 210.31175231933594, 'learning_rate': 1.801633393829401e-05, 'epoch': 7.98} +{'loss': 37.8831, 'grad_norm': 251.96026611328125, 'learning_rate': 1.8010889292196006e-05, 'epoch': 7.99} +{'loss': 38.8926, 'grad_norm': 273.8665771484375, 'learning_rate': 1.8005444646098005e-05, 'epoch': 7.99} +{'loss': 30.0468, 'grad_norm': 207.25836181640625, 'learning_rate': 1.8e-05, 'epoch': 7.99} +{'loss': 24.0549, 'grad_norm': 200.5218048095703, 'learning_rate': 1.7994555353901996e-05, 'epoch': 8.0} +{'loss': 22.3158, 'grad_norm': 245.7149200439453, 'learning_rate': 1.798911070780399e-05, 'epoch': 8.0} +{'loss': 43.2342, 'grad_norm': 263.85546875, 'learning_rate': 1.798366606170599e-05, 'epoch': 8.0} +{'loss': 44.0931, 'grad_norm': 244.57205200195312, 'learning_rate': 1.797822141560799e-05, 'epoch': 8.01} +{'loss': 42.1926, 'grad_norm': 196.4144287109375, 'learning_rate': 1.7972776769509984e-05, 'epoch': 8.01} +{'loss': 41.4664, 'grad_norm': 282.3250427246094, 'learning_rate': 1.796733212341198e-05, 'epoch': 8.01} + 40%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2230/5520 [1:56:11<2:38:00, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6222901344299316, 'eval_runtime': 3.1406, 'eval_samples_per_second': 56.996, 'eval_steps_per_second': 56.996, 'epoch': 8.01} +{'loss': 42.2133, 'grad_norm': 186.79281616210938, 'learning_rate': 1.7961887477313975e-05, 'epoch': 8.02} +{'loss': 42.0159, 'grad_norm': 220.3788299560547, 'learning_rate': 1.795644283121597e-05, 'epoch': 8.02} +{'loss': 42.6055, 'grad_norm': 262.37078857421875, 'learning_rate': 1.7950998185117966e-05, 'epoch': 8.03} +{'loss': 43.3061, 'grad_norm': 199.07078552246094, 'learning_rate': 1.7945553539019964e-05, 'epoch': 8.03} +{'loss': 42.4806, 'grad_norm': 256.6651306152344, 'learning_rate': 1.794010889292196e-05, 'epoch': 8.03} +{'loss': 43.9823, 'grad_norm': 281.17431640625, 'learning_rate': 1.793466424682396e-05, 'epoch': 8.04} +{'loss': 41.8372, 'grad_norm': 201.19837951660156, 'learning_rate': 1.7929219600725954e-05, 'epoch': 8.04} +{'loss': 38.8656, 'grad_norm': 195.1905059814453, 'learning_rate': 1.792377495462795e-05, 'epoch': 8.04} +{'loss': 39.8965, 'grad_norm': 215.02772521972656, 'learning_rate': 1.7918330308529948e-05, 'epoch': 8.05} +{'loss': 41.0917, 'grad_norm': 202.16322326660156, 'learning_rate': 1.7912885662431944e-05, 'epoch': 8.05} + 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2240/5520 [1:56:42<2:38:19, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6212881207466125, 'eval_runtime': 3.1412, 'eval_samples_per_second': 56.984, 'eval_steps_per_second': 56.984, 'epoch': 8.05} +{'loss': 38.5499, 'grad_norm': 218.90786743164062, 'learning_rate': 1.790744101633394e-05, 'epoch': 8.05} +{'loss': 39.5915, 'grad_norm': 179.57138061523438, 'learning_rate': 1.7901996370235934e-05, 'epoch': 8.06} +{'loss': 39.6094, 'grad_norm': 242.74801635742188, 'learning_rate': 1.789655172413793e-05, 'epoch': 8.06} +{'loss': 40.6025, 'grad_norm': 183.07102966308594, 'learning_rate': 1.7891107078039925e-05, 'epoch': 8.07} +{'loss': 40.3013, 'grad_norm': 192.85418701171875, 'learning_rate': 1.7885662431941924e-05, 'epoch': 8.07} +{'loss': 39.1747, 'grad_norm': 254.26353454589844, 'learning_rate': 1.7880217785843923e-05, 'epoch': 8.07} +{'loss': 40.7569, 'grad_norm': 230.7747802734375, 'learning_rate': 1.7874773139745918e-05, 'epoch': 8.08} +{'loss': 40.0753, 'grad_norm': 179.30528259277344, 'learning_rate': 1.7869328493647913e-05, 'epoch': 8.08} +{'loss': 41.4453, 'grad_norm': 203.48915100097656, 'learning_rate': 1.786388384754991e-05, 'epoch': 8.08} +{'loss': 40.5818, 'grad_norm': 274.8970947265625, 'learning_rate': 1.7858439201451908e-05, 'epoch': 8.09} + 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2250/5520 [1:57:14<2:36:43, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6184170842170715, 'eval_runtime': 3.1515, 'eval_samples_per_second': 56.799, 'eval_steps_per_second': 56.799, 'epoch': 8.09} +{'loss': 42.5794, 'grad_norm': 237.2452392578125, 'learning_rate': 1.7852994555353903e-05, 'epoch': 8.09} +{'loss': 41.89, 'grad_norm': 236.33766174316406, 'learning_rate': 1.7847549909255898e-05, 'epoch': 8.09} +{'loss': 41.7726, 'grad_norm': 269.4791564941406, 'learning_rate': 1.7842105263157894e-05, 'epoch': 8.1} +{'loss': 40.1187, 'grad_norm': 192.28457641601562, 'learning_rate': 1.783666061705989e-05, 'epoch': 8.1} +{'loss': 36.8004, 'grad_norm': 201.5625457763672, 'learning_rate': 1.7831215970961888e-05, 'epoch': 8.1} +{'loss': 33.8354, 'grad_norm': 175.7625274658203, 'learning_rate': 1.7825771324863887e-05, 'epoch': 8.11} +{'loss': 33.5176, 'grad_norm': 195.6171112060547, 'learning_rate': 1.7820326678765882e-05, 'epoch': 8.11} +{'loss': 34.2908, 'grad_norm': 158.7554168701172, 'learning_rate': 1.7814882032667877e-05, 'epoch': 8.12} +{'loss': 34.0861, 'grad_norm': 192.78900146484375, 'learning_rate': 1.7809437386569873e-05, 'epoch': 8.12} +{'loss': 35.5742, 'grad_norm': 186.6603240966797, 'learning_rate': 1.7803992740471868e-05, 'epoch': 8.12} + 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2260/5520 [1:57:46<2:37:08, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6207499504089355, 'eval_runtime': 3.1451, 'eval_samples_per_second': 56.913, 'eval_steps_per_second': 56.913, 'epoch': 8.12} +{'loss': 35.6709, 'grad_norm': 264.3590087890625, 'learning_rate': 1.7798548094373867e-05, 'epoch': 8.13} +{'loss': 36.4221, 'grad_norm': 202.9478302001953, 'learning_rate': 1.7793103448275862e-05, 'epoch': 8.13} +{'loss': 36.0745, 'grad_norm': 229.260498046875, 'learning_rate': 1.7787658802177858e-05, 'epoch': 8.13} +{'loss': 37.3266, 'grad_norm': 222.37716674804688, 'learning_rate': 1.7782214156079856e-05, 'epoch': 8.14} +{'loss': 37.2819, 'grad_norm': 217.02272033691406, 'learning_rate': 1.7776769509981852e-05, 'epoch': 8.14} +{'loss': 37.2683, 'grad_norm': 247.61016845703125, 'learning_rate': 1.7771324863883847e-05, 'epoch': 8.14} +{'loss': 36.7165, 'grad_norm': 209.7449493408203, 'learning_rate': 1.7765880217785846e-05, 'epoch': 8.15} +{'loss': 37.0805, 'grad_norm': 217.30722045898438, 'learning_rate': 1.776043557168784e-05, 'epoch': 8.15} +{'loss': 38.0326, 'grad_norm': 181.5167236328125, 'learning_rate': 1.7754990925589837e-05, 'epoch': 8.16} +{'loss': 37.1798, 'grad_norm': 217.4818878173828, 'learning_rate': 1.7749546279491832e-05, 'epoch': 8.16} + 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2270/5520 [1:58:17<2:37:22, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6218119263648987, 'eval_runtime': 3.1425, 'eval_samples_per_second': 56.961, 'eval_steps_per_second': 56.961, 'epoch': 8.16} +{'loss': 36.6039, 'grad_norm': 233.60733032226562, 'learning_rate': 1.7744101633393828e-05, 'epoch': 8.16} +{'loss': 30.6188, 'grad_norm': 184.5128631591797, 'learning_rate': 1.7738656987295826e-05, 'epoch': 8.17} +{'loss': 24.0782, 'grad_norm': 154.25791931152344, 'learning_rate': 1.7733212341197825e-05, 'epoch': 8.17} +{'loss': 23.7072, 'grad_norm': 179.92723083496094, 'learning_rate': 1.772776769509982e-05, 'epoch': 8.17} +{'loss': 24.0008, 'grad_norm': 170.87684631347656, 'learning_rate': 1.7722323049001816e-05, 'epoch': 8.18} +{'loss': 24.8393, 'grad_norm': 179.25233459472656, 'learning_rate': 1.771687840290381e-05, 'epoch': 8.18} +{'loss': 44.0573, 'grad_norm': 268.7836608886719, 'learning_rate': 1.7711433756805807e-05, 'epoch': 8.18} +{'loss': 45.0218, 'grad_norm': 249.12033081054688, 'learning_rate': 1.7705989110707805e-05, 'epoch': 8.19} +{'loss': 43.1954, 'grad_norm': 275.2551574707031, 'learning_rate': 1.77005444646098e-05, 'epoch': 8.19} +{'loss': 43.0807, 'grad_norm': 233.5360107421875, 'learning_rate': 1.7695099818511796e-05, 'epoch': 8.2} + 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2280/5520 [1:58:49<2:36:23, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6311450600624084, 'eval_runtime': 3.1403, 'eval_samples_per_second': 57.001, 'eval_steps_per_second': 57.001, 'epoch': 8.2} +{'loss': 43.8161, 'grad_norm': 201.01617431640625, 'learning_rate': 1.768965517241379e-05, 'epoch': 8.2} +{'loss': 43.3388, 'grad_norm': 243.028564453125, 'learning_rate': 1.7684210526315787e-05, 'epoch': 8.2} +{'loss': 42.6949, 'grad_norm': 191.8246307373047, 'learning_rate': 1.767876588021779e-05, 'epoch': 8.21} +{'loss': 43.3541, 'grad_norm': 241.33609008789062, 'learning_rate': 1.7673321234119784e-05, 'epoch': 8.21} +{'loss': 44.4262, 'grad_norm': 247.99066162109375, 'learning_rate': 1.766787658802178e-05, 'epoch': 8.21} +{'loss': 42.5696, 'grad_norm': 223.35452270507812, 'learning_rate': 1.7662431941923775e-05, 'epoch': 8.22} +{'loss': 41.9236, 'grad_norm': 208.75209045410156, 'learning_rate': 1.765698729582577e-05, 'epoch': 8.22} +{'loss': 39.962, 'grad_norm': 229.60305786132812, 'learning_rate': 1.7651542649727766e-05, 'epoch': 8.22} +{'loss': 39.0847, 'grad_norm': 294.3867492675781, 'learning_rate': 1.7646098003629765e-05, 'epoch': 8.23} +{'loss': 39.1451, 'grad_norm': 201.49679565429688, 'learning_rate': 1.764065335753176e-05, 'epoch': 8.23} + 41%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2290/5520 [1:59:21<2:36:51, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6214079856872559, 'eval_runtime': 3.1503, 'eval_samples_per_second': 56.82, 'eval_steps_per_second': 56.82, 'epoch': 8.23} +{'loss': 39.4673, 'grad_norm': 201.57894897460938, 'learning_rate': 1.7635208711433756e-05, 'epoch': 8.23} +{'loss': 39.9832, 'grad_norm': 201.0395965576172, 'learning_rate': 1.7629764065335754e-05, 'epoch': 8.24} +{'loss': 40.3885, 'grad_norm': 274.41168212890625, 'learning_rate': 1.762431941923775e-05, 'epoch': 8.24} +{'loss': 39.5292, 'grad_norm': 173.79977416992188, 'learning_rate': 1.761887477313975e-05, 'epoch': 8.25} +{'loss': 40.3855, 'grad_norm': 194.91806030273438, 'learning_rate': 1.7613430127041744e-05, 'epoch': 8.25} +{'loss': 40.937, 'grad_norm': 216.47213745117188, 'learning_rate': 1.760798548094374e-05, 'epoch': 8.25} +{'loss': 41.2523, 'grad_norm': 168.1825714111328, 'learning_rate': 1.7602540834845735e-05, 'epoch': 8.26} +{'loss': 40.6913, 'grad_norm': 187.51914978027344, 'learning_rate': 1.759709618874773e-05, 'epoch': 8.26} +{'loss': 42.5074, 'grad_norm': 183.99844360351562, 'learning_rate': 1.759165154264973e-05, 'epoch': 8.26} +{'loss': 42.0519, 'grad_norm': 201.23797607421875, 'learning_rate': 1.7586206896551724e-05, 'epoch': 8.27} + 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2300/5520 [1:59:53<2:35:49, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6184054017066956, 'eval_runtime': 3.1465, 'eval_samples_per_second': 56.889, 'eval_steps_per_second': 56.889, 'epoch': 8.27} +{'loss': 41.7059, 'grad_norm': 219.0037384033203, 'learning_rate': 1.7580762250453723e-05, 'epoch': 8.27} +{'loss': 40.9004, 'grad_norm': 221.00173950195312, 'learning_rate': 1.7575317604355718e-05, 'epoch': 8.27} +{'loss': 38.7865, 'grad_norm': 180.00828552246094, 'learning_rate': 1.7569872958257714e-05, 'epoch': 8.28} +{'loss': 39.207, 'grad_norm': 210.69302368164062, 'learning_rate': 1.756442831215971e-05, 'epoch': 8.28} +{'loss': 39.4472, 'grad_norm': 196.8787078857422, 'learning_rate': 1.7558983666061708e-05, 'epoch': 8.29} +{'loss': 36.5539, 'grad_norm': 229.16331481933594, 'learning_rate': 1.7553539019963703e-05, 'epoch': 8.29} +{'loss': 34.3887, 'grad_norm': 180.67474365234375, 'learning_rate': 1.75480943738657e-05, 'epoch': 8.29} +{'loss': 34.158, 'grad_norm': 234.046875, 'learning_rate': 1.7542649727767694e-05, 'epoch': 8.3} +{'loss': 34.7655, 'grad_norm': 213.34255981445312, 'learning_rate': 1.753720508166969e-05, 'epoch': 8.3} +{'loss': 34.4223, 'grad_norm': 205.6382598876953, 'learning_rate': 1.753176043557169e-05, 'epoch': 8.3} + 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2310/5520 [2:00:25<2:35:05, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6200549006462097, 'eval_runtime': 3.1447, 'eval_samples_per_second': 56.921, 'eval_steps_per_second': 56.921, 'epoch': 8.3} +{'loss': 35.3846, 'grad_norm': 189.79238891601562, 'learning_rate': 1.7526315789473687e-05, 'epoch': 8.31} +{'loss': 34.9006, 'grad_norm': 202.27859497070312, 'learning_rate': 1.7520871143375682e-05, 'epoch': 8.31} +{'loss': 36.3079, 'grad_norm': 217.62327575683594, 'learning_rate': 1.7515426497277678e-05, 'epoch': 8.31} +{'loss': 35.8598, 'grad_norm': 212.82862854003906, 'learning_rate': 1.7509981851179673e-05, 'epoch': 8.32} +{'loss': 37.0853, 'grad_norm': 229.778564453125, 'learning_rate': 1.750453720508167e-05, 'epoch': 8.32} +{'loss': 38.01, 'grad_norm': 219.99844360351562, 'learning_rate': 1.7499092558983667e-05, 'epoch': 8.33} +{'loss': 36.4756, 'grad_norm': 202.63035583496094, 'learning_rate': 1.7493647912885663e-05, 'epoch': 8.33} +{'loss': 37.0509, 'grad_norm': 188.44094848632812, 'learning_rate': 1.7488203266787658e-05, 'epoch': 8.33} +{'loss': 38.0019, 'grad_norm': 187.8760223388672, 'learning_rate': 1.7482758620689657e-05, 'epoch': 8.34} +{'loss': 38.2255, 'grad_norm': 239.35833740234375, 'learning_rate': 1.7477313974591652e-05, 'epoch': 8.34} + 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2320/5520 [2:00:56<2:34:39, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6221747994422913, 'eval_runtime': 3.148, 'eval_samples_per_second': 56.862, 'eval_steps_per_second': 56.862, 'epoch': 8.34} +{'loss': 37.3598, 'grad_norm': 236.3567657470703, 'learning_rate': 1.747186932849365e-05, 'epoch': 8.34} +{'loss': 27.1993, 'grad_norm': 188.16151428222656, 'learning_rate': 1.7466424682395646e-05, 'epoch': 8.35} +{'loss': 23.7024, 'grad_norm': 216.58778381347656, 'learning_rate': 1.746098003629764e-05, 'epoch': 8.35} +{'loss': 24.2856, 'grad_norm': 221.03111267089844, 'learning_rate': 1.7455535390199637e-05, 'epoch': 8.35} +{'loss': 23.7624, 'grad_norm': 180.36221313476562, 'learning_rate': 1.7450090744101632e-05, 'epoch': 8.36} +{'loss': 25.8628, 'grad_norm': 198.77438354492188, 'learning_rate': 1.7444646098003628e-05, 'epoch': 8.36} +{'loss': 43.4097, 'grad_norm': 250.81321716308594, 'learning_rate': 1.7439201451905627e-05, 'epoch': 8.36} +{'loss': 44.7141, 'grad_norm': 246.19544982910156, 'learning_rate': 1.7433756805807622e-05, 'epoch': 8.37} +{'loss': 44.4511, 'grad_norm': 245.04241943359375, 'learning_rate': 1.742831215970962e-05, 'epoch': 8.37} +{'loss': 43.5971, 'grad_norm': 224.05331420898438, 'learning_rate': 1.7422867513611616e-05, 'epoch': 8.38} + 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2330/5520 [2:01:28<2:34:26, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6324251294136047, 'eval_runtime': 3.1463, 'eval_samples_per_second': 56.892, 'eval_steps_per_second': 56.892, 'epoch': 8.38} +{'loss': 42.9007, 'grad_norm': 222.3795623779297, 'learning_rate': 1.741742286751361e-05, 'epoch': 8.38} +{'loss': 42.8733, 'grad_norm': 210.0133514404297, 'learning_rate': 1.741197822141561e-05, 'epoch': 8.38} +{'loss': 42.9875, 'grad_norm': 222.01031494140625, 'learning_rate': 1.7406533575317606e-05, 'epoch': 8.39} +{'loss': 42.4873, 'grad_norm': 187.30101013183594, 'learning_rate': 1.74010889292196e-05, 'epoch': 8.39} +{'loss': 42.2066, 'grad_norm': 188.22048950195312, 'learning_rate': 1.7395644283121596e-05, 'epoch': 8.39} +{'loss': 42.7604, 'grad_norm': 228.75363159179688, 'learning_rate': 1.7390199637023592e-05, 'epoch': 8.4} +{'loss': 42.445, 'grad_norm': 196.8817901611328, 'learning_rate': 1.7384754990925587e-05, 'epoch': 8.4} +{'loss': 39.8408, 'grad_norm': 205.3610382080078, 'learning_rate': 1.737931034482759e-05, 'epoch': 8.4} +{'loss': 40.847, 'grad_norm': 259.0702819824219, 'learning_rate': 1.7373865698729585e-05, 'epoch': 8.41} +{'loss': 40.4648, 'grad_norm': 216.12017822265625, 'learning_rate': 1.736842105263158e-05, 'epoch': 8.41} + 42%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2340/5520 [2:02:00<2:33:47, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6252871155738831, 'eval_runtime': 3.1421, 'eval_samples_per_second': 56.969, 'eval_steps_per_second': 56.969, 'epoch': 8.41} +{'loss': 39.7682, 'grad_norm': 330.9464111328125, 'learning_rate': 1.7362976406533575e-05, 'epoch': 8.42} +{'loss': 38.8824, 'grad_norm': 237.19505310058594, 'learning_rate': 1.735753176043557e-05, 'epoch': 8.42} +{'loss': 40.1187, 'grad_norm': 247.22259521484375, 'learning_rate': 1.735208711433757e-05, 'epoch': 8.42} +{'loss': 40.4589, 'grad_norm': 267.739990234375, 'learning_rate': 1.7346642468239565e-05, 'epoch': 8.43} +{'loss': 41.5481, 'grad_norm': 308.715576171875, 'learning_rate': 1.734119782214156e-05, 'epoch': 8.43} +{'loss': 41.6628, 'grad_norm': 350.8972473144531, 'learning_rate': 1.7335753176043556e-05, 'epoch': 8.43} +{'loss': 40.3527, 'grad_norm': 245.9825897216797, 'learning_rate': 1.7330308529945555e-05, 'epoch': 8.44} +{'loss': 39.6388, 'grad_norm': 253.94488525390625, 'learning_rate': 1.732486388384755e-05, 'epoch': 8.44} +{'loss': 40.5561, 'grad_norm': 226.24179077148438, 'learning_rate': 1.731941923774955e-05, 'epoch': 8.44} +{'loss': 41.8422, 'grad_norm': 188.66746520996094, 'learning_rate': 1.7313974591651544e-05, 'epoch': 8.45} + 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2350/5520 [2:02:32<2:32:43, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6197592616081238, 'eval_runtime': 3.1522, 'eval_samples_per_second': 56.786, 'eval_steps_per_second': 56.786, 'epoch': 8.45} +{'loss': 41.4184, 'grad_norm': 227.01014709472656, 'learning_rate': 1.730852994555354e-05, 'epoch': 8.45} +{'loss': 40.796, 'grad_norm': 187.11643981933594, 'learning_rate': 1.7303085299455535e-05, 'epoch': 8.46} +{'loss': 41.7926, 'grad_norm': 243.1756134033203, 'learning_rate': 1.729764065335753e-05, 'epoch': 8.46} +{'loss': 41.588, 'grad_norm': 226.15187072753906, 'learning_rate': 1.729219600725953e-05, 'epoch': 8.46} +{'loss': 39.6935, 'grad_norm': 218.49935913085938, 'learning_rate': 1.7286751361161524e-05, 'epoch': 8.47} +{'loss': 37.0718, 'grad_norm': 232.4805145263672, 'learning_rate': 1.7281306715063523e-05, 'epoch': 8.47} +{'loss': 33.9633, 'grad_norm': 201.1748046875, 'learning_rate': 1.727586206896552e-05, 'epoch': 8.47} +{'loss': 33.4553, 'grad_norm': 208.79733276367188, 'learning_rate': 1.7270417422867514e-05, 'epoch': 8.48} +{'loss': 33.6144, 'grad_norm': 235.91151428222656, 'learning_rate': 1.726497277676951e-05, 'epoch': 8.48} +{'loss': 35.3678, 'grad_norm': 206.28811645507812, 'learning_rate': 1.7259528130671508e-05, 'epoch': 8.48} + 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2360/5520 [2:03:04<2:32:40, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6203061938285828, 'eval_runtime': 3.1417, 'eval_samples_per_second': 56.976, 'eval_steps_per_second': 56.976, 'epoch': 8.48} +{'loss': 35.9175, 'grad_norm': 305.2204284667969, 'learning_rate': 1.7254083484573503e-05, 'epoch': 8.49} +{'loss': 35.5001, 'grad_norm': 227.1592254638672, 'learning_rate': 1.72486388384755e-05, 'epoch': 8.49} +{'loss': 35.0015, 'grad_norm': 194.739501953125, 'learning_rate': 1.7243194192377494e-05, 'epoch': 8.49} +{'loss': 36.8257, 'grad_norm': 233.8467254638672, 'learning_rate': 1.723774954627949e-05, 'epoch': 8.5} +{'loss': 36.1246, 'grad_norm': 258.8914489746094, 'learning_rate': 1.7232304900181492e-05, 'epoch': 8.5} +{'loss': 36.1245, 'grad_norm': 194.8585968017578, 'learning_rate': 1.7226860254083487e-05, 'epoch': 8.51} +{'loss': 37.0608, 'grad_norm': 191.2276153564453, 'learning_rate': 1.7221415607985483e-05, 'epoch': 8.51} +{'loss': 37.0779, 'grad_norm': 197.9025115966797, 'learning_rate': 1.7215970961887478e-05, 'epoch': 8.51} +{'loss': 37.8432, 'grad_norm': 207.01016235351562, 'learning_rate': 1.7210526315789473e-05, 'epoch': 8.52} +{'loss': 36.6983, 'grad_norm': 222.20201110839844, 'learning_rate': 1.720508166969147e-05, 'epoch': 8.52} + 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2370/5520 [2:03:35<2:31:57, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6240220665931702, 'eval_runtime': 3.1492, 'eval_samples_per_second': 56.84, 'eval_steps_per_second': 56.84, 'epoch': 8.52} +{'loss': 38.0613, 'grad_norm': 200.19273376464844, 'learning_rate': 1.7199637023593467e-05, 'epoch': 8.52} +{'loss': 29.6395, 'grad_norm': 205.36758422851562, 'learning_rate': 1.7194192377495463e-05, 'epoch': 8.53} +{'loss': 23.6478, 'grad_norm': 206.53396606445312, 'learning_rate': 1.7188747731397458e-05, 'epoch': 8.53} +{'loss': 22.8522, 'grad_norm': 219.47044372558594, 'learning_rate': 1.7183303085299454e-05, 'epoch': 8.53} +{'loss': 24.1411, 'grad_norm': 178.48008728027344, 'learning_rate': 1.7177858439201452e-05, 'epoch': 8.54} +{'loss': 26.2818, 'grad_norm': 222.63731384277344, 'learning_rate': 1.717241379310345e-05, 'epoch': 8.54} +{'loss': 42.5599, 'grad_norm': 216.6333465576172, 'learning_rate': 1.7166969147005447e-05, 'epoch': 8.55} +{'loss': 44.0016, 'grad_norm': 241.42532348632812, 'learning_rate': 1.7161524500907442e-05, 'epoch': 8.55} +{'loss': 44.1662, 'grad_norm': 227.95193481445312, 'learning_rate': 1.7156079854809437e-05, 'epoch': 8.55} +{'loss': 41.2255, 'grad_norm': 204.9208526611328, 'learning_rate': 1.7150635208711433e-05, 'epoch': 8.56} + 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2380/5520 [2:04:07<2:32:35, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6293933987617493, 'eval_runtime': 3.1467, 'eval_samples_per_second': 56.884, 'eval_steps_per_second': 56.884, 'epoch': 8.56} +{'loss': 42.8374, 'grad_norm': 168.1370849609375, 'learning_rate': 1.7145190562613428e-05, 'epoch': 8.56} +{'loss': 42.4378, 'grad_norm': 209.16641235351562, 'learning_rate': 1.7139745916515427e-05, 'epoch': 8.56} +{'loss': 43.3213, 'grad_norm': 235.36373901367188, 'learning_rate': 1.7134301270417422e-05, 'epoch': 8.57} +{'loss': 43.5621, 'grad_norm': 198.8206329345703, 'learning_rate': 1.712885662431942e-05, 'epoch': 8.57} +{'loss': 41.8729, 'grad_norm': 191.1640167236328, 'learning_rate': 1.7123411978221416e-05, 'epoch': 8.57} +{'loss': 42.8306, 'grad_norm': 281.6352233886719, 'learning_rate': 1.7117967332123412e-05, 'epoch': 8.58} +{'loss': 41.3603, 'grad_norm': 191.68939208984375, 'learning_rate': 1.711252268602541e-05, 'epoch': 8.58} +{'loss': 38.7076, 'grad_norm': 175.3041229248047, 'learning_rate': 1.7107078039927406e-05, 'epoch': 8.59} +{'loss': 38.832, 'grad_norm': 186.31202697753906, 'learning_rate': 1.71016333938294e-05, 'epoch': 8.59} +{'loss': 40.6542, 'grad_norm': 192.0680389404297, 'learning_rate': 1.7096188747731397e-05, 'epoch': 8.59} + 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2390/5520 [2:04:39<2:31:44, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6245992183685303, 'eval_runtime': 3.1487, 'eval_samples_per_second': 56.848, 'eval_steps_per_second': 56.848, 'epoch': 8.59} +{'loss': 40.3145, 'grad_norm': 284.3516540527344, 'learning_rate': 1.7090744101633392e-05, 'epoch': 8.6} +{'loss': 39.9109, 'grad_norm': 210.2421875, 'learning_rate': 1.708529945553539e-05, 'epoch': 8.6} +{'loss': 39.0686, 'grad_norm': 202.3438720703125, 'learning_rate': 1.707985480943739e-05, 'epoch': 8.6} +{'loss': 40.6673, 'grad_norm': 189.5508270263672, 'learning_rate': 1.7074410163339385e-05, 'epoch': 8.61} +{'loss': 40.5357, 'grad_norm': 199.3516387939453, 'learning_rate': 1.706896551724138e-05, 'epoch': 8.61} +{'loss': 40.7691, 'grad_norm': 183.11309814453125, 'learning_rate': 1.7063520871143376e-05, 'epoch': 8.61} +{'loss': 40.6822, 'grad_norm': 347.104248046875, 'learning_rate': 1.705807622504537e-05, 'epoch': 8.62} +{'loss': 40.9791, 'grad_norm': 341.0453796386719, 'learning_rate': 1.705263157894737e-05, 'epoch': 8.62} +{'loss': 41.0977, 'grad_norm': 335.33221435546875, 'learning_rate': 1.7047186932849365e-05, 'epoch': 8.62} +{'loss': 41.3332, 'grad_norm': 209.75198364257812, 'learning_rate': 1.704174228675136e-05, 'epoch': 8.63} + 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2400/5520 [2:05:11<2:30:14, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6176490783691406, 'eval_runtime': 3.1444, 'eval_samples_per_second': 56.927, 'eval_steps_per_second': 56.927, 'epoch': 8.63} +{'loss': 41.7456, 'grad_norm': 221.6715545654297, 'learning_rate': 1.7036297640653356e-05, 'epoch': 8.63} +{'loss': 41.7063, 'grad_norm': 255.7875213623047, 'learning_rate': 1.7030852994555355e-05, 'epoch': 8.64} +{'loss': 41.941, 'grad_norm': 206.66221618652344, 'learning_rate': 1.7025408348457354e-05, 'epoch': 8.64} +{'loss': 42.8615, 'grad_norm': 381.9871826171875, 'learning_rate': 1.701996370235935e-05, 'epoch': 8.64} +{'loss': 37.8472, 'grad_norm': 303.8249816894531, 'learning_rate': 1.7014519056261344e-05, 'epoch': 8.65} +{'loss': 35.4641, 'grad_norm': 201.2444610595703, 'learning_rate': 1.700907441016334e-05, 'epoch': 8.65} +{'loss': 33.3414, 'grad_norm': 242.34298706054688, 'learning_rate': 1.7003629764065335e-05, 'epoch': 8.65} +{'loss': 33.7771, 'grad_norm': 214.45384216308594, 'learning_rate': 1.699818511796733e-05, 'epoch': 8.66} +{'loss': 35.4289, 'grad_norm': 276.4810485839844, 'learning_rate': 1.699274047186933e-05, 'epoch': 8.66} +{'loss': 34.4205, 'grad_norm': 199.68626403808594, 'learning_rate': 1.6987295825771325e-05, 'epoch': 8.66} + 43%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2400/5520 [2:05:14<2:30:14, 2.89s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2410/5520 [2:05:44<2:32:37, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6179484128952026, 'eval_runtime': 3.1618, 'eval_samples_per_second': 56.614, 'eval_steps_per_second': 56.614, 'epoch': 8.66} +{'loss': 34.3428, 'grad_norm': 239.19200134277344, 'learning_rate': 1.698185117967332e-05, 'epoch': 8.67} +{'loss': 37.6011, 'grad_norm': 341.44927978515625, 'learning_rate': 1.697640653357532e-05, 'epoch': 8.67} +{'loss': 34.9222, 'grad_norm': 260.5967102050781, 'learning_rate': 1.6970961887477314e-05, 'epoch': 8.68} +{'loss': 36.6177, 'grad_norm': 217.9357147216797, 'learning_rate': 1.6965517241379313e-05, 'epoch': 8.68} +{'loss': 36.3072, 'grad_norm': 355.21917724609375, 'learning_rate': 1.696007259528131e-05, 'epoch': 8.68} +{'loss': 36.7026, 'grad_norm': 279.37200927734375, 'learning_rate': 1.6954627949183304e-05, 'epoch': 8.69} +{'loss': 37.5009, 'grad_norm': 344.9017028808594, 'learning_rate': 1.69491833030853e-05, 'epoch': 8.69} +{'loss': 36.0914, 'grad_norm': 225.28668212890625, 'learning_rate': 1.6943738656987295e-05, 'epoch': 8.69} +{'loss': 38.0917, 'grad_norm': 233.16372680664062, 'learning_rate': 1.693829401088929e-05, 'epoch': 8.7} +{'loss': 37.4493, 'grad_norm': 220.2307891845703, 'learning_rate': 1.693284936479129e-05, 'epoch': 8.7} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2420/5520 [2:06:16<2:31:11, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6225734949111938, 'eval_runtime': 3.1427, 'eval_samples_per_second': 56.958, 'eval_steps_per_second': 56.958, 'epoch': 8.7} +{'loss': 37.6527, 'grad_norm': 298.2883605957031, 'learning_rate': 1.6927404718693287e-05, 'epoch': 8.7} +{'loss': 30.9627, 'grad_norm': 329.1615295410156, 'learning_rate': 1.6921960072595283e-05, 'epoch': 8.71} +{'loss': 24.2028, 'grad_norm': 192.55380249023438, 'learning_rate': 1.6916515426497278e-05, 'epoch': 8.71} +{'loss': 23.3005, 'grad_norm': 162.13583374023438, 'learning_rate': 1.6911070780399274e-05, 'epoch': 8.72} +{'loss': 24.335, 'grad_norm': 152.95108032226562, 'learning_rate': 1.6905626134301272e-05, 'epoch': 8.72} +{'loss': 24.9279, 'grad_norm': 183.4193572998047, 'learning_rate': 1.6900181488203268e-05, 'epoch': 8.72} +{'loss': 43.4574, 'grad_norm': 232.93650817871094, 'learning_rate': 1.6894736842105263e-05, 'epoch': 8.73} +{'loss': 44.4136, 'grad_norm': 226.85890197753906, 'learning_rate': 1.688929219600726e-05, 'epoch': 8.73} +{'loss': 42.8183, 'grad_norm': 232.16064453125, 'learning_rate': 1.6883847549909254e-05, 'epoch': 8.73} +{'loss': 43.3031, 'grad_norm': 243.5811767578125, 'learning_rate': 1.6878402903811253e-05, 'epoch': 8.74} + 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2430/5520 [2:06:48<2:28:55, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6284167170524597, 'eval_runtime': 3.1444, 'eval_samples_per_second': 56.927, 'eval_steps_per_second': 56.927, 'epoch': 8.74} +{'loss': 42.1276, 'grad_norm': 194.7115020751953, 'learning_rate': 1.687295825771325e-05, 'epoch': 8.74} +{'loss': 42.5535, 'grad_norm': 250.81983947753906, 'learning_rate': 1.6867513611615247e-05, 'epoch': 8.74} +{'loss': 42.7745, 'grad_norm': 205.1988983154297, 'learning_rate': 1.6862068965517242e-05, 'epoch': 8.75} +{'loss': 43.6562, 'grad_norm': 159.68243408203125, 'learning_rate': 1.6856624319419238e-05, 'epoch': 8.75} +{'loss': 43.4602, 'grad_norm': 164.31361389160156, 'learning_rate': 1.6851179673321233e-05, 'epoch': 8.75} +{'loss': 42.1559, 'grad_norm': 213.9793243408203, 'learning_rate': 1.6845735027223232e-05, 'epoch': 8.76} +{'loss': 41.5687, 'grad_norm': 205.79107666015625, 'learning_rate': 1.6840290381125227e-05, 'epoch': 8.76} +{'loss': 41.0748, 'grad_norm': 235.80348205566406, 'learning_rate': 1.6834845735027223e-05, 'epoch': 8.77} +{'loss': 39.3348, 'grad_norm': 203.84884643554688, 'learning_rate': 1.682940108892922e-05, 'epoch': 8.77} +{'loss': 39.357, 'grad_norm': 271.2411804199219, 'learning_rate': 1.6823956442831217e-05, 'epoch': 8.77} + 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2440/5520 [2:07:20<2:29:40, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6211046576499939, 'eval_runtime': 3.1402, 'eval_samples_per_second': 57.002, 'eval_steps_per_second': 57.002, 'epoch': 8.77} +{'loss': 39.2198, 'grad_norm': 222.4960174560547, 'learning_rate': 1.6818511796733212e-05, 'epoch': 8.78} +{'loss': 40.572, 'grad_norm': 325.9942932128906, 'learning_rate': 1.681306715063521e-05, 'epoch': 8.78} +{'loss': 39.2727, 'grad_norm': 195.2740936279297, 'learning_rate': 1.6807622504537206e-05, 'epoch': 8.78} +{'loss': 40.6503, 'grad_norm': 196.16964721679688, 'learning_rate': 1.68021778584392e-05, 'epoch': 8.79} +{'loss': 41.2074, 'grad_norm': 183.2659454345703, 'learning_rate': 1.6796733212341197e-05, 'epoch': 8.79} +{'loss': 40.2778, 'grad_norm': 293.393798828125, 'learning_rate': 1.6791288566243192e-05, 'epoch': 8.79} +{'loss': 40.0305, 'grad_norm': 232.8402099609375, 'learning_rate': 1.678584392014519e-05, 'epoch': 8.8} +{'loss': 40.4216, 'grad_norm': 269.957275390625, 'learning_rate': 1.678039927404719e-05, 'epoch': 8.8} +{'loss': 40.7998, 'grad_norm': 175.6732635498047, 'learning_rate': 1.6774954627949185e-05, 'epoch': 8.81} +{'loss': 41.1176, 'grad_norm': 209.0604248046875, 'learning_rate': 1.676950998185118e-05, 'epoch': 8.81} + 44%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 2450/5520 [2:07:52<2:31:05, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6211614012718201, 'eval_runtime': 3.15, 'eval_samples_per_second': 56.826, 'eval_steps_per_second': 56.826, 'epoch': 8.81} +{'loss': 41.37, 'grad_norm': 229.91171264648438, 'learning_rate': 1.6764065335753176e-05, 'epoch': 8.81} +{'loss': 41.8377, 'grad_norm': 192.99610900878906, 'learning_rate': 1.675862068965517e-05, 'epoch': 8.82} +{'loss': 42.3038, 'grad_norm': 239.290771484375, 'learning_rate': 1.675317604355717e-05, 'epoch': 8.82} +{'loss': 41.3334, 'grad_norm': 203.52330017089844, 'learning_rate': 1.6747731397459166e-05, 'epoch': 8.82} +{'loss': 37.7455, 'grad_norm': 247.99099731445312, 'learning_rate': 1.674228675136116e-05, 'epoch': 8.83} +{'loss': 34.6828, 'grad_norm': 205.9770965576172, 'learning_rate': 1.6736842105263156e-05, 'epoch': 8.83} +{'loss': 34.927, 'grad_norm': 215.47024536132812, 'learning_rate': 1.6731397459165152e-05, 'epoch': 8.83} +{'loss': 35.3194, 'grad_norm': 254.14010620117188, 'learning_rate': 1.6725952813067154e-05, 'epoch': 8.84} +{'loss': 34.9577, 'grad_norm': 221.18174743652344, 'learning_rate': 1.672050816696915e-05, 'epoch': 8.84} +{'loss': 33.7244, 'grad_norm': 191.1651611328125, 'learning_rate': 1.6715063520871145e-05, 'epoch': 8.85} + 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2460/5520 [2:08:24<2:28:00, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6216589212417603, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.963, 'eval_steps_per_second': 56.963, 'epoch': 8.85} +{'loss': 34.9689, 'grad_norm': 228.3920135498047, 'learning_rate': 1.670961887477314e-05, 'epoch': 8.85} +{'loss': 36.0718, 'grad_norm': 227.6689910888672, 'learning_rate': 1.6704174228675135e-05, 'epoch': 8.85} +{'loss': 37.1143, 'grad_norm': 182.38978576660156, 'learning_rate': 1.669872958257713e-05, 'epoch': 8.86} +{'loss': 34.4468, 'grad_norm': 223.66966247558594, 'learning_rate': 1.669328493647913e-05, 'epoch': 8.86} +{'loss': 36.7305, 'grad_norm': 260.3930358886719, 'learning_rate': 1.6687840290381125e-05, 'epoch': 8.86} +{'loss': 36.1995, 'grad_norm': 218.60385131835938, 'learning_rate': 1.668239564428312e-05, 'epoch': 8.87} +{'loss': 35.9138, 'grad_norm': 227.4342041015625, 'learning_rate': 1.667695099818512e-05, 'epoch': 8.87} +{'loss': 37.2621, 'grad_norm': 208.42196655273438, 'learning_rate': 1.6671506352087115e-05, 'epoch': 8.87} +{'loss': 38.5176, 'grad_norm': 214.9486541748047, 'learning_rate': 1.6666061705989113e-05, 'epoch': 8.88} +{'loss': 38.3917, 'grad_norm': 226.6992645263672, 'learning_rate': 1.666061705989111e-05, 'epoch': 8.88} + 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2470/5520 [2:08:56<2:27:36, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6277003884315491, 'eval_runtime': 3.1426, 'eval_samples_per_second': 56.959, 'eval_steps_per_second': 56.959, 'epoch': 8.88} +{'loss': 39.1439, 'grad_norm': 282.3875732421875, 'learning_rate': 1.6655172413793104e-05, 'epoch': 8.88} +{'loss': 33.7717, 'grad_norm': 240.29022216796875, 'learning_rate': 1.66497277676951e-05, 'epoch': 8.89} +{'loss': 24.1146, 'grad_norm': 231.84727478027344, 'learning_rate': 1.6644283121597095e-05, 'epoch': 8.89} +{'loss': 24.0165, 'grad_norm': 215.5159149169922, 'learning_rate': 1.663883847549909e-05, 'epoch': 8.9} +{'loss': 24.2048, 'grad_norm': 278.42950439453125, 'learning_rate': 1.663339382940109e-05, 'epoch': 8.9} +{'loss': 24.7332, 'grad_norm': 187.03341674804688, 'learning_rate': 1.6627949183303088e-05, 'epoch': 8.9} +{'loss': 42.6764, 'grad_norm': 261.2938232421875, 'learning_rate': 1.6622504537205083e-05, 'epoch': 8.91} +{'loss': 42.9894, 'grad_norm': 234.00880432128906, 'learning_rate': 1.661705989110708e-05, 'epoch': 8.91} +{'loss': 43.3274, 'grad_norm': 263.2890319824219, 'learning_rate': 1.6611615245009074e-05, 'epoch': 8.91} +{'loss': 44.3862, 'grad_norm': 286.3260192871094, 'learning_rate': 1.6606170598911073e-05, 'epoch': 8.92} + 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2480/5520 [2:09:27<2:26:20, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6278789043426514, 'eval_runtime': 3.1423, 'eval_samples_per_second': 56.964, 'eval_steps_per_second': 56.964, 'epoch': 8.92} +{'loss': 43.4195, 'grad_norm': 273.5133972167969, 'learning_rate': 1.6600725952813068e-05, 'epoch': 8.92} +{'loss': 43.153, 'grad_norm': 246.2245330810547, 'learning_rate': 1.6595281306715063e-05, 'epoch': 8.92} +{'loss': 41.1276, 'grad_norm': 261.3001403808594, 'learning_rate': 1.658983666061706e-05, 'epoch': 8.93} +{'loss': 40.5055, 'grad_norm': 263.7626037597656, 'learning_rate': 1.6584392014519054e-05, 'epoch': 8.93} +{'loss': 40.7098, 'grad_norm': 233.80442810058594, 'learning_rate': 1.6578947368421053e-05, 'epoch': 8.94} +{'loss': 40.5404, 'grad_norm': 334.1268615722656, 'learning_rate': 1.6573502722323052e-05, 'epoch': 8.94} +{'loss': 40.3434, 'grad_norm': 319.56689453125, 'learning_rate': 1.6568058076225047e-05, 'epoch': 8.94} +{'loss': 41.1956, 'grad_norm': 388.0625915527344, 'learning_rate': 1.6562613430127043e-05, 'epoch': 8.95} +{'loss': 41.9647, 'grad_norm': 256.9087829589844, 'learning_rate': 1.6557168784029038e-05, 'epoch': 8.95} +{'loss': 41.1885, 'grad_norm': 248.2635040283203, 'learning_rate': 1.6551724137931033e-05, 'epoch': 8.95} + 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2490/5520 [2:09:59<2:31:04, 2.99s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6198933124542236, 'eval_runtime': 3.1401, 'eval_samples_per_second': 57.004, 'eval_steps_per_second': 57.004, 'epoch': 8.95} +{'loss': 41.2178, 'grad_norm': 236.89004516601562, 'learning_rate': 1.6546279491833032e-05, 'epoch': 8.96} +{'loss': 42.1472, 'grad_norm': 260.47357177734375, 'learning_rate': 1.6540834845735027e-05, 'epoch': 8.96} +{'loss': 36.14, 'grad_norm': 216.1390380859375, 'learning_rate': 1.6535390199637023e-05, 'epoch': 8.96} +{'loss': 33.7272, 'grad_norm': 194.7316131591797, 'learning_rate': 1.652994555353902e-05, 'epoch': 8.97} +{'loss': 34.9427, 'grad_norm': 202.0404052734375, 'learning_rate': 1.6524500907441017e-05, 'epoch': 8.97} +{'loss': 36.4874, 'grad_norm': 196.98463439941406, 'learning_rate': 1.6519056261343016e-05, 'epoch': 8.98} +{'loss': 35.7667, 'grad_norm': 211.46177673339844, 'learning_rate': 1.651361161524501e-05, 'epoch': 8.98} +{'loss': 35.6874, 'grad_norm': 190.47093200683594, 'learning_rate': 1.6508166969147006e-05, 'epoch': 8.98} +{'loss': 36.8718, 'grad_norm': 194.9825897216797, 'learning_rate': 1.6502722323049002e-05, 'epoch': 8.99} +{'loss': 37.4962, 'grad_norm': 230.24774169921875, 'learning_rate': 1.6497277676950997e-05, 'epoch': 8.99} + 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2500/5520 [2:10:31<2:28:24, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6168100237846375, 'eval_runtime': 3.142, 'eval_samples_per_second': 56.97, 'eval_steps_per_second': 56.97, 'epoch': 8.99} +{'loss': 35.5063, 'grad_norm': 266.5688171386719, 'learning_rate': 1.6491833030852993e-05, 'epoch': 8.99} +{'loss': 23.5847, 'grad_norm': 230.923828125, 'learning_rate': 1.648638838475499e-05, 'epoch': 9.0} +{'loss': 21.7926, 'grad_norm': 187.365478515625, 'learning_rate': 1.6480943738656987e-05, 'epoch': 9.0} +{'loss': 41.4221, 'grad_norm': 283.487060546875, 'learning_rate': 1.6475499092558986e-05, 'epoch': 9.0} +{'loss': 43.3343, 'grad_norm': 234.38009643554688, 'learning_rate': 1.647005444646098e-05, 'epoch': 9.01} +{'loss': 42.1983, 'grad_norm': 253.75588989257812, 'learning_rate': 1.6464609800362976e-05, 'epoch': 9.01} +{'loss': 41.5355, 'grad_norm': 224.6202392578125, 'learning_rate': 1.6459165154264975e-05, 'epoch': 9.01} +{'loss': 42.3058, 'grad_norm': 261.0040588378906, 'learning_rate': 1.645372050816697e-05, 'epoch': 9.02} +{'loss': 42.3911, 'grad_norm': 191.44142150878906, 'learning_rate': 1.6448275862068966e-05, 'epoch': 9.02} +{'loss': 41.6238, 'grad_norm': 246.79278564453125, 'learning_rate': 1.644283121597096e-05, 'epoch': 9.03} + 45%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2510/5520 [2:11:03<2:27:13, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6220878958702087, 'eval_runtime': 3.1552, 'eval_samples_per_second': 56.731, 'eval_steps_per_second': 56.731, 'epoch': 9.03} +{'loss': 43.9275, 'grad_norm': 251.5475311279297, 'learning_rate': 1.6437386569872957e-05, 'epoch': 9.03} +{'loss': 42.8938, 'grad_norm': 300.0381164550781, 'learning_rate': 1.6431941923774952e-05, 'epoch': 9.03} +{'loss': 42.3538, 'grad_norm': 310.0517883300781, 'learning_rate': 1.6426497277676954e-05, 'epoch': 9.04} +{'loss': 40.2305, 'grad_norm': 213.50392150878906, 'learning_rate': 1.642105263157895e-05, 'epoch': 9.04} +{'loss': 38.3336, 'grad_norm': 173.3816680908203, 'learning_rate': 1.6415607985480945e-05, 'epoch': 9.04} +{'loss': 38.5937, 'grad_norm': 195.51968383789062, 'learning_rate': 1.641016333938294e-05, 'epoch': 9.05} +{'loss': 37.9994, 'grad_norm': 195.68910217285156, 'learning_rate': 1.6404718693284936e-05, 'epoch': 9.05} +{'loss': 38.6006, 'grad_norm': 239.56704711914062, 'learning_rate': 1.6399274047186934e-05, 'epoch': 9.05} +{'loss': 39.9516, 'grad_norm': 455.8309326171875, 'learning_rate': 1.639382940108893e-05, 'epoch': 9.06} +{'loss': 38.8922, 'grad_norm': 188.0857696533203, 'learning_rate': 1.6388384754990925e-05, 'epoch': 9.06} + 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 2520/5520 [2:11:36<2:27:00, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6177002191543579, 'eval_runtime': 3.1595, 'eval_samples_per_second': 56.654, 'eval_steps_per_second': 56.654, 'epoch': 9.06} +{'loss': 38.8895, 'grad_norm': 211.76168823242188, 'learning_rate': 1.638294010889292e-05, 'epoch': 9.07} +{'loss': 39.9238, 'grad_norm': 281.7332458496094, 'learning_rate': 1.637749546279492e-05, 'epoch': 9.07} +{'loss': 41.2667, 'grad_norm': 254.9953155517578, 'learning_rate': 1.6372050816696915e-05, 'epoch': 9.07} +{'loss': 39.3087, 'grad_norm': 233.8746337890625, 'learning_rate': 1.6366606170598914e-05, 'epoch': 9.08} +{'loss': 40.4902, 'grad_norm': 317.71270751953125, 'learning_rate': 1.636116152450091e-05, 'epoch': 9.08} +{'loss': 40.1197, 'grad_norm': 227.5228271484375, 'learning_rate': 1.6355716878402904e-05, 'epoch': 9.08} +{'loss': 42.9099, 'grad_norm': 225.84423828125, 'learning_rate': 1.63502722323049e-05, 'epoch': 9.09} +{'loss': 42.0515, 'grad_norm': 255.20858764648438, 'learning_rate': 1.6344827586206895e-05, 'epoch': 9.09} +{'loss': 41.6817, 'grad_norm': 215.45352172851562, 'learning_rate': 1.6339382940108894e-05, 'epoch': 9.09} +{'loss': 42.6121, 'grad_norm': 233.5334014892578, 'learning_rate': 1.633393829401089e-05, 'epoch': 9.1} + 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2530/5520 [2:12:08<2:25:01, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6148340106010437, 'eval_runtime': 3.1444, 'eval_samples_per_second': 56.926, 'eval_steps_per_second': 56.926, 'epoch': 9.1} +{'loss': 40.5833, 'grad_norm': 196.54132080078125, 'learning_rate': 1.6328493647912888e-05, 'epoch': 9.1} +{'loss': 39.098, 'grad_norm': 296.7503967285156, 'learning_rate': 1.6323049001814883e-05, 'epoch': 9.1} +{'loss': 36.0076, 'grad_norm': 272.1104431152344, 'learning_rate': 1.631760435571688e-05, 'epoch': 9.11} +{'loss': 33.3503, 'grad_norm': 197.3100128173828, 'learning_rate': 1.6312159709618874e-05, 'epoch': 9.11} +{'loss': 33.1386, 'grad_norm': 223.1310272216797, 'learning_rate': 1.6306715063520873e-05, 'epoch': 9.12} +{'loss': 34.2101, 'grad_norm': 234.86093139648438, 'learning_rate': 1.630127041742287e-05, 'epoch': 9.12} +{'loss': 34.955, 'grad_norm': 244.72328186035156, 'learning_rate': 1.6295825771324864e-05, 'epoch': 9.12} +{'loss': 34.5405, 'grad_norm': 198.89134216308594, 'learning_rate': 1.629038112522686e-05, 'epoch': 9.13} +{'loss': 35.2328, 'grad_norm': 236.64096069335938, 'learning_rate': 1.6284936479128854e-05, 'epoch': 9.13} +{'loss': 34.6642, 'grad_norm': 212.8743438720703, 'learning_rate': 1.6279491833030853e-05, 'epoch': 9.13} + 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2540/5520 [2:12:40<2:27:59, 2.98s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6154256463050842, 'eval_runtime': 3.1434, 'eval_samples_per_second': 56.944, 'eval_steps_per_second': 56.944, 'epoch': 9.13} +{'loss': 35.652, 'grad_norm': 227.15135192871094, 'learning_rate': 1.6274047186932852e-05, 'epoch': 9.14} +{'loss': 36.8476, 'grad_norm': 207.30572509765625, 'learning_rate': 1.6268602540834847e-05, 'epoch': 9.14} +{'loss': 35.8299, 'grad_norm': 222.18023681640625, 'learning_rate': 1.6263157894736843e-05, 'epoch': 9.14} +{'loss': 36.5074, 'grad_norm': 283.674072265625, 'learning_rate': 1.6257713248638838e-05, 'epoch': 9.15} +{'loss': 37.344, 'grad_norm': 235.69752502441406, 'learning_rate': 1.6252268602540834e-05, 'epoch': 9.15} +{'loss': 37.8138, 'grad_norm': 224.37965393066406, 'learning_rate': 1.6246823956442832e-05, 'epoch': 9.16} +{'loss': 37.1529, 'grad_norm': 217.52230834960938, 'learning_rate': 1.6241379310344828e-05, 'epoch': 9.16} +{'loss': 36.3247, 'grad_norm': 234.7586212158203, 'learning_rate': 1.6235934664246823e-05, 'epoch': 9.16} +{'loss': 30.0805, 'grad_norm': 239.52479553222656, 'learning_rate': 1.623049001814882e-05, 'epoch': 9.17} +{'loss': 23.8492, 'grad_norm': 223.7616424560547, 'learning_rate': 1.6225045372050817e-05, 'epoch': 9.17} + 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2550/5520 [2:13:12<2:26:01, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6244915723800659, 'eval_runtime': 3.1387, 'eval_samples_per_second': 57.031, 'eval_steps_per_second': 57.031, 'epoch': 9.17} +{'loss': 23.3557, 'grad_norm': 213.41371154785156, 'learning_rate': 1.6219600725952816e-05, 'epoch': 9.17} +{'loss': 23.8834, 'grad_norm': 162.4627685546875, 'learning_rate': 1.621415607985481e-05, 'epoch': 9.18} +{'loss': 24.6428, 'grad_norm': 172.13250732421875, 'learning_rate': 1.6208711433756807e-05, 'epoch': 9.18} +{'loss': 42.5908, 'grad_norm': 229.30799865722656, 'learning_rate': 1.6203266787658802e-05, 'epoch': 9.18} +{'loss': 43.7286, 'grad_norm': 195.30130004882812, 'learning_rate': 1.6197822141560798e-05, 'epoch': 9.19} +{'loss': 43.5012, 'grad_norm': 227.4984893798828, 'learning_rate': 1.6192377495462793e-05, 'epoch': 9.19} +{'loss': 41.9295, 'grad_norm': 254.69615173339844, 'learning_rate': 1.6186932849364792e-05, 'epoch': 9.2} +{'loss': 42.0838, 'grad_norm': 251.33778381347656, 'learning_rate': 1.6181488203266787e-05, 'epoch': 9.2} +{'loss': 43.0031, 'grad_norm': 237.91677856445312, 'learning_rate': 1.6176043557168786e-05, 'epoch': 9.2} +{'loss': 42.7196, 'grad_norm': 258.0311584472656, 'learning_rate': 1.617059891107078e-05, 'epoch': 9.21} + 46%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2560/5520 [2:13:45<2:25:07, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6245208978652954, 'eval_runtime': 3.1484, 'eval_samples_per_second': 56.854, 'eval_steps_per_second': 56.854, 'epoch': 9.21} +{'loss': 42.1342, 'grad_norm': 197.14703369140625, 'learning_rate': 1.6165154264972777e-05, 'epoch': 9.21} +{'loss': 41.8462, 'grad_norm': 235.19705200195312, 'learning_rate': 1.6159709618874775e-05, 'epoch': 9.21} +{'loss': 43.5993, 'grad_norm': 198.409423828125, 'learning_rate': 1.615426497277677e-05, 'epoch': 9.22} +{'loss': 40.771, 'grad_norm': 254.08590698242188, 'learning_rate': 1.6148820326678766e-05, 'epoch': 9.22} +{'loss': 39.3511, 'grad_norm': 181.64808654785156, 'learning_rate': 1.614337568058076e-05, 'epoch': 9.22} +{'loss': 39.6586, 'grad_norm': 294.1127014160156, 'learning_rate': 1.6137931034482757e-05, 'epoch': 9.23} +{'loss': 38.2575, 'grad_norm': 197.59982299804688, 'learning_rate': 1.6132486388384752e-05, 'epoch': 9.23} +{'loss': 38.8801, 'grad_norm': 223.74717712402344, 'learning_rate': 1.6127041742286754e-05, 'epoch': 9.23} +{'loss': 40.4591, 'grad_norm': 279.2779541015625, 'learning_rate': 1.612159709618875e-05, 'epoch': 9.24} +{'loss': 39.2172, 'grad_norm': 258.75909423828125, 'learning_rate': 1.6116152450090745e-05, 'epoch': 9.24} + 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2570/5520 [2:14:17<2:22:27, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6209923624992371, 'eval_runtime': 3.1523, 'eval_samples_per_second': 56.784, 'eval_steps_per_second': 56.784, 'epoch': 9.24} +{'loss': 40.442, 'grad_norm': 305.0645446777344, 'learning_rate': 1.611070780399274e-05, 'epoch': 9.25} +{'loss': 39.7092, 'grad_norm': 196.18557739257812, 'learning_rate': 1.6105263157894736e-05, 'epoch': 9.25} +{'loss': 39.3935, 'grad_norm': 214.3220977783203, 'learning_rate': 1.6099818511796735e-05, 'epoch': 9.25} +{'loss': 40.39, 'grad_norm': 217.2801055908203, 'learning_rate': 1.609437386569873e-05, 'epoch': 9.26} +{'loss': 39.9531, 'grad_norm': 205.17446899414062, 'learning_rate': 1.6088929219600726e-05, 'epoch': 9.26} +{'loss': 40.474, 'grad_norm': 197.3854217529297, 'learning_rate': 1.608348457350272e-05, 'epoch': 9.26} +{'loss': 41.2794, 'grad_norm': 264.3934631347656, 'learning_rate': 1.607803992740472e-05, 'epoch': 9.27} +{'loss': 40.3425, 'grad_norm': 226.6471710205078, 'learning_rate': 1.6072595281306715e-05, 'epoch': 9.27} +{'loss': 41.6261, 'grad_norm': 198.62734985351562, 'learning_rate': 1.6067150635208714e-05, 'epoch': 9.27} +{'loss': 41.7835, 'grad_norm': 207.73509216308594, 'learning_rate': 1.606170598911071e-05, 'epoch': 9.28} + 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2580/5520 [2:14:48<2:23:00, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6173180937767029, 'eval_runtime': 3.149, 'eval_samples_per_second': 56.843, 'eval_steps_per_second': 56.843, 'epoch': 9.28} +{'loss': 40.0095, 'grad_norm': 214.13601684570312, 'learning_rate': 1.6056261343012705e-05, 'epoch': 9.28} +{'loss': 40.014, 'grad_norm': 218.0533905029297, 'learning_rate': 1.60508166969147e-05, 'epoch': 9.29} +{'loss': 36.7399, 'grad_norm': 211.27984619140625, 'learning_rate': 1.6045372050816695e-05, 'epoch': 9.29} +{'loss': 33.7555, 'grad_norm': 201.9020233154297, 'learning_rate': 1.6039927404718694e-05, 'epoch': 9.29} +{'loss': 32.9646, 'grad_norm': 230.27149963378906, 'learning_rate': 1.603448275862069e-05, 'epoch': 9.3} +{'loss': 33.5332, 'grad_norm': 208.77622985839844, 'learning_rate': 1.6029038112522685e-05, 'epoch': 9.3} +{'loss': 34.2592, 'grad_norm': 225.02796936035156, 'learning_rate': 1.6023593466424684e-05, 'epoch': 9.3} +{'loss': 34.6686, 'grad_norm': 201.79612731933594, 'learning_rate': 1.601814882032668e-05, 'epoch': 9.31} +{'loss': 35.4554, 'grad_norm': 235.6588134765625, 'learning_rate': 1.6012704174228678e-05, 'epoch': 9.31} +{'loss': 35.2077, 'grad_norm': 273.51904296875, 'learning_rate': 1.6007259528130673e-05, 'epoch': 9.31} + 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2590/5520 [2:15:20<2:21:40, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6169624328613281, 'eval_runtime': 3.1501, 'eval_samples_per_second': 56.823, 'eval_steps_per_second': 56.823, 'epoch': 9.31} +{'loss': 35.0703, 'grad_norm': 199.19541931152344, 'learning_rate': 1.600181488203267e-05, 'epoch': 9.32} +{'loss': 35.9691, 'grad_norm': 212.49276733398438, 'learning_rate': 1.5996370235934664e-05, 'epoch': 9.32} +{'loss': 34.9043, 'grad_norm': 193.7330322265625, 'learning_rate': 1.599092558983666e-05, 'epoch': 9.33} +{'loss': 36.3508, 'grad_norm': 196.00503540039062, 'learning_rate': 1.5985480943738655e-05, 'epoch': 9.33} +{'loss': 34.7672, 'grad_norm': 218.78392028808594, 'learning_rate': 1.5980036297640654e-05, 'epoch': 9.33} +{'loss': 36.8695, 'grad_norm': 235.76873779296875, 'learning_rate': 1.5974591651542652e-05, 'epoch': 9.34} +{'loss': 37.4531, 'grad_norm': 250.538330078125, 'learning_rate': 1.5969147005444648e-05, 'epoch': 9.34} +{'loss': 37.4506, 'grad_norm': 234.12469482421875, 'learning_rate': 1.5963702359346643e-05, 'epoch': 9.34} +{'loss': 31.3062, 'grad_norm': 209.3461151123047, 'learning_rate': 1.595825771324864e-05, 'epoch': 9.35} +{'loss': 23.3303, 'grad_norm': 211.12277221679688, 'learning_rate': 1.5952813067150637e-05, 'epoch': 9.35} + 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2600/5520 [2:15:52<2:20:56, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6222187876701355, 'eval_runtime': 3.1459, 'eval_samples_per_second': 56.9, 'eval_steps_per_second': 56.9, 'epoch': 9.35} +{'loss': 22.9145, 'grad_norm': 200.1257781982422, 'learning_rate': 1.5947368421052633e-05, 'epoch': 9.35} +{'loss': 23.8842, 'grad_norm': 179.01475524902344, 'learning_rate': 1.5941923774954628e-05, 'epoch': 9.36} +{'loss': 25.4154, 'grad_norm': 214.9254608154297, 'learning_rate': 1.5936479128856623e-05, 'epoch': 9.36} +{'loss': 42.6467, 'grad_norm': 211.63735961914062, 'learning_rate': 1.593103448275862e-05, 'epoch': 9.36} +{'loss': 43.3501, 'grad_norm': 232.43194580078125, 'learning_rate': 1.5925589836660618e-05, 'epoch': 9.37} +{'loss': 43.4324, 'grad_norm': 220.61468505859375, 'learning_rate': 1.5920145190562616e-05, 'epoch': 9.37} +{'loss': 41.9646, 'grad_norm': 179.00894165039062, 'learning_rate': 1.591470054446461e-05, 'epoch': 9.38} +{'loss': 41.1242, 'grad_norm': 203.847412109375, 'learning_rate': 1.5909255898366607e-05, 'epoch': 9.38} +{'loss': 42.2451, 'grad_norm': 244.20164489746094, 'learning_rate': 1.5903811252268602e-05, 'epoch': 9.38} +{'loss': 42.0361, 'grad_norm': 203.60154724121094, 'learning_rate': 1.5898366606170598e-05, 'epoch': 9.39} + 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2600/5520 [2:15:55<2:20:56, 2.90s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2610/5520 [2:16:25<2:21:16, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.627146303653717, 'eval_runtime': 3.1484, 'eval_samples_per_second': 56.854, 'eval_steps_per_second': 56.854, 'epoch': 9.39} +{'loss': 41.9657, 'grad_norm': 185.1741180419922, 'learning_rate': 1.5892921960072597e-05, 'epoch': 9.39} +{'loss': 42.2619, 'grad_norm': 211.64219665527344, 'learning_rate': 1.5887477313974592e-05, 'epoch': 9.39} +{'loss': 42.5666, 'grad_norm': 253.31997680664062, 'learning_rate': 1.5882032667876587e-05, 'epoch': 9.4} +{'loss': 43.1747, 'grad_norm': 257.8781433105469, 'learning_rate': 1.5876588021778586e-05, 'epoch': 9.4} +{'loss': 41.2645, 'grad_norm': 171.05398559570312, 'learning_rate': 1.587114337568058e-05, 'epoch': 9.4} +{'loss': 38.7138, 'grad_norm': 209.83749389648438, 'learning_rate': 1.5865698729582577e-05, 'epoch': 9.41} +{'loss': 38.7962, 'grad_norm': 303.92059326171875, 'learning_rate': 1.5860254083484576e-05, 'epoch': 9.41} +{'loss': 39.0622, 'grad_norm': 271.9322204589844, 'learning_rate': 1.585480943738657e-05, 'epoch': 9.42} +{'loss': 40.0773, 'grad_norm': 222.8749542236328, 'learning_rate': 1.5849364791288566e-05, 'epoch': 9.42} +{'loss': 39.3495, 'grad_norm': 194.549072265625, 'learning_rate': 1.5843920145190562e-05, 'epoch': 9.42} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 47%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2620/5520 [2:16:56<2:19:26, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.618250846862793, 'eval_runtime': 3.1517, 'eval_samples_per_second': 56.796, 'eval_steps_per_second': 56.796, 'epoch': 9.42} +{'loss': 39.7577, 'grad_norm': 231.32623291015625, 'learning_rate': 1.5838475499092557e-05, 'epoch': 9.43} +{'loss': 40.9342, 'grad_norm': 185.9986114501953, 'learning_rate': 1.5833030852994556e-05, 'epoch': 9.43} +{'loss': 39.7733, 'grad_norm': 221.356201171875, 'learning_rate': 1.5827586206896555e-05, 'epoch': 9.43} +{'loss': 39.7559, 'grad_norm': 216.2249755859375, 'learning_rate': 1.582214156079855e-05, 'epoch': 9.44} +{'loss': 41.2872, 'grad_norm': 263.5106201171875, 'learning_rate': 1.5816696914700546e-05, 'epoch': 9.44} +{'loss': 41.1114, 'grad_norm': 281.9518127441406, 'learning_rate': 1.581125226860254e-05, 'epoch': 9.44} +{'loss': 41.7711, 'grad_norm': 200.2808074951172, 'learning_rate': 1.5805807622504536e-05, 'epoch': 9.45} +{'loss': 41.3306, 'grad_norm': 233.034912109375, 'learning_rate': 1.5800362976406535e-05, 'epoch': 9.45} +{'loss': 41.0065, 'grad_norm': 215.5499725341797, 'learning_rate': 1.579491833030853e-05, 'epoch': 9.46} +{'loss': 42.1116, 'grad_norm': 220.21153259277344, 'learning_rate': 1.5789473684210526e-05, 'epoch': 9.46} + 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2630/5520 [2:17:28<2:19:41, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6146022081375122, 'eval_runtime': 3.145, 'eval_samples_per_second': 56.915, 'eval_steps_per_second': 56.915, 'epoch': 9.46} +{'loss': 39.637, 'grad_norm': 198.20001220703125, 'learning_rate': 1.578402903811252e-05, 'epoch': 9.46} +{'loss': 37.3831, 'grad_norm': 228.18357849121094, 'learning_rate': 1.5778584392014517e-05, 'epoch': 9.47} +{'loss': 35.6356, 'grad_norm': 207.68040466308594, 'learning_rate': 1.577313974591652e-05, 'epoch': 9.47} +{'loss': 34.5549, 'grad_norm': 267.0474853515625, 'learning_rate': 1.5767695099818514e-05, 'epoch': 9.47} +{'loss': 35.1065, 'grad_norm': 191.4129638671875, 'learning_rate': 1.576225045372051e-05, 'epoch': 9.48} +{'loss': 34.9115, 'grad_norm': 220.85708618164062, 'learning_rate': 1.5756805807622505e-05, 'epoch': 9.48} +{'loss': 33.9542, 'grad_norm': 218.62460327148438, 'learning_rate': 1.57513611615245e-05, 'epoch': 9.48} +{'loss': 35.2981, 'grad_norm': 184.085693359375, 'learning_rate': 1.5745916515426496e-05, 'epoch': 9.49} +{'loss': 36.8326, 'grad_norm': 286.73236083984375, 'learning_rate': 1.5740471869328494e-05, 'epoch': 9.49} +{'loss': 35.9728, 'grad_norm': 326.4263000488281, 'learning_rate': 1.573502722323049e-05, 'epoch': 9.49} + 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2640/5520 [2:18:00<2:19:18, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6165672540664673, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.964, 'eval_steps_per_second': 56.964, 'epoch': 9.49} +{'loss': 37.4227, 'grad_norm': 283.330322265625, 'learning_rate': 1.5729582577132485e-05, 'epoch': 9.5} +{'loss': 36.8613, 'grad_norm': 208.65829467773438, 'learning_rate': 1.5724137931034484e-05, 'epoch': 9.5} +{'loss': 36.2332, 'grad_norm': 191.59429931640625, 'learning_rate': 1.571869328493648e-05, 'epoch': 9.51} +{'loss': 36.8045, 'grad_norm': 306.4736022949219, 'learning_rate': 1.5713248638838478e-05, 'epoch': 9.51} +{'loss': 37.005, 'grad_norm': 226.97509765625, 'learning_rate': 1.5707803992740474e-05, 'epoch': 9.51} +{'loss': 36.9168, 'grad_norm': 230.47683715820312, 'learning_rate': 1.570235934664247e-05, 'epoch': 9.52} +{'loss': 39.0025, 'grad_norm': 221.44483947753906, 'learning_rate': 1.5696914700544464e-05, 'epoch': 9.52} +{'loss': 38.1069, 'grad_norm': 249.1531219482422, 'learning_rate': 1.569147005444646e-05, 'epoch': 9.52} +{'loss': 30.9819, 'grad_norm': 276.8532409667969, 'learning_rate': 1.5686025408348455e-05, 'epoch': 9.53} +{'loss': 23.4807, 'grad_norm': 218.25035095214844, 'learning_rate': 1.5680580762250454e-05, 'epoch': 9.53} + 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2650/5520 [2:18:32<2:19:10, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.619295060634613, 'eval_runtime': 3.1427, 'eval_samples_per_second': 56.958, 'eval_steps_per_second': 56.958, 'epoch': 9.53} +{'loss': 22.5394, 'grad_norm': 185.83737182617188, 'learning_rate': 1.5675136116152453e-05, 'epoch': 9.53} +{'loss': 23.9106, 'grad_norm': 181.9920654296875, 'learning_rate': 1.5669691470054448e-05, 'epoch': 9.54} +{'loss': 25.5328, 'grad_norm': 209.20391845703125, 'learning_rate': 1.5664246823956443e-05, 'epoch': 9.54} +{'loss': 42.8563, 'grad_norm': 223.86093139648438, 'learning_rate': 1.565880217785844e-05, 'epoch': 9.55} +{'loss': 44.0178, 'grad_norm': 232.3086395263672, 'learning_rate': 1.5653357531760438e-05, 'epoch': 9.55} +{'loss': 43.4928, 'grad_norm': 223.76541137695312, 'learning_rate': 1.5647912885662433e-05, 'epoch': 9.55} +{'loss': 42.3422, 'grad_norm': 258.86700439453125, 'learning_rate': 1.5642468239564428e-05, 'epoch': 9.56} +{'loss': 41.6588, 'grad_norm': 255.09033203125, 'learning_rate': 1.5637023593466424e-05, 'epoch': 9.56} +{'loss': 41.9267, 'grad_norm': 205.88563537597656, 'learning_rate': 1.563157894736842e-05, 'epoch': 9.56} +{'loss': 43.0326, 'grad_norm': 204.12318420410156, 'learning_rate': 1.5626134301270418e-05, 'epoch': 9.57} + 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2660/5520 [2:19:03<2:17:24, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6218730807304382, 'eval_runtime': 3.1401, 'eval_samples_per_second': 57.004, 'eval_steps_per_second': 57.004, 'epoch': 9.57} +{'loss': 42.9604, 'grad_norm': 259.5694274902344, 'learning_rate': 1.5620689655172417e-05, 'epoch': 9.57} +{'loss': 42.7316, 'grad_norm': 234.35935974121094, 'learning_rate': 1.5615245009074412e-05, 'epoch': 9.57} +{'loss': 42.4559, 'grad_norm': 237.14346313476562, 'learning_rate': 1.5609800362976407e-05, 'epoch': 9.58} +{'loss': 40.1113, 'grad_norm': 208.2974395751953, 'learning_rate': 1.5604355716878403e-05, 'epoch': 9.58} +{'loss': 38.6515, 'grad_norm': 212.18814086914062, 'learning_rate': 1.5598911070780398e-05, 'epoch': 9.59} +{'loss': 39.5289, 'grad_norm': 245.23240661621094, 'learning_rate': 1.5593466424682397e-05, 'epoch': 9.59} +{'loss': 39.3232, 'grad_norm': 261.1321105957031, 'learning_rate': 1.5588021778584392e-05, 'epoch': 9.59} +{'loss': 40.3963, 'grad_norm': 257.67962646484375, 'learning_rate': 1.5582577132486388e-05, 'epoch': 9.6} +{'loss': 39.0657, 'grad_norm': 299.93914794921875, 'learning_rate': 1.5577132486388383e-05, 'epoch': 9.6} +{'loss': 40.1408, 'grad_norm': 215.45407104492188, 'learning_rate': 1.5571687840290382e-05, 'epoch': 9.6} + 48%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2670/5520 [2:19:35<2:17:09, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6216554045677185, 'eval_runtime': 3.1451, 'eval_samples_per_second': 56.913, 'eval_steps_per_second': 56.913, 'epoch': 9.6} +{'loss': 40.6894, 'grad_norm': 273.9233093261719, 'learning_rate': 1.5566243194192377e-05, 'epoch': 9.61} +{'loss': 40.8146, 'grad_norm': 220.76344299316406, 'learning_rate': 1.5560798548094376e-05, 'epoch': 9.61} +{'loss': 40.1362, 'grad_norm': 200.33929443359375, 'learning_rate': 1.555535390199637e-05, 'epoch': 9.61} +{'loss': 39.3488, 'grad_norm': 223.38536071777344, 'learning_rate': 1.5549909255898367e-05, 'epoch': 9.62} +{'loss': 41.771, 'grad_norm': 240.99578857421875, 'learning_rate': 1.5544464609800362e-05, 'epoch': 9.62} +{'loss': 41.1412, 'grad_norm': 202.30323791503906, 'learning_rate': 1.5539019963702357e-05, 'epoch': 9.62} +{'loss': 41.0064, 'grad_norm': 193.8411865234375, 'learning_rate': 1.5533575317604356e-05, 'epoch': 9.63} +{'loss': 41.4787, 'grad_norm': 197.1542510986328, 'learning_rate': 1.552813067150635e-05, 'epoch': 9.63} +{'loss': 41.753, 'grad_norm': 259.21954345703125, 'learning_rate': 1.552268602540835e-05, 'epoch': 9.64} +{'loss': 40.4589, 'grad_norm': 290.9770202636719, 'learning_rate': 1.5517241379310346e-05, 'epoch': 9.64} + 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2680/5520 [2:20:07<2:17:05, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6132164001464844, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.963, 'eval_steps_per_second': 56.963, 'epoch': 9.64} +{'loss': 37.356, 'grad_norm': 252.86219787597656, 'learning_rate': 1.551179673321234e-05, 'epoch': 9.64} +{'loss': 36.2071, 'grad_norm': 207.79254150390625, 'learning_rate': 1.550635208711434e-05, 'epoch': 9.65} +{'loss': 33.5074, 'grad_norm': 186.78857421875, 'learning_rate': 1.5500907441016335e-05, 'epoch': 9.65} +{'loss': 33.7103, 'grad_norm': 212.5107421875, 'learning_rate': 1.549546279491833e-05, 'epoch': 9.65} +{'loss': 34.3476, 'grad_norm': 243.2950897216797, 'learning_rate': 1.5490018148820326e-05, 'epoch': 9.66} +{'loss': 34.5377, 'grad_norm': 221.66415405273438, 'learning_rate': 1.548457350272232e-05, 'epoch': 9.66} +{'loss': 34.3663, 'grad_norm': 231.8260955810547, 'learning_rate': 1.5479128856624317e-05, 'epoch': 9.66} +{'loss': 35.5723, 'grad_norm': 284.6401062011719, 'learning_rate': 1.547368421052632e-05, 'epoch': 9.67} +{'loss': 35.5628, 'grad_norm': 373.43865966796875, 'learning_rate': 1.5468239564428314e-05, 'epoch': 9.67} +{'loss': 35.6192, 'grad_norm': 325.18316650390625, 'learning_rate': 1.546279491833031e-05, 'epoch': 9.68} + 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2690/5520 [2:20:39<2:16:54, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.613842248916626, 'eval_runtime': 3.1437, 'eval_samples_per_second': 56.94, 'eval_steps_per_second': 56.94, 'epoch': 9.68} +{'loss': 36.4789, 'grad_norm': 353.14739990234375, 'learning_rate': 1.5457350272232305e-05, 'epoch': 9.68} +{'loss': 36.0412, 'grad_norm': 215.21836853027344, 'learning_rate': 1.54519056261343e-05, 'epoch': 9.68} +{'loss': 37.1118, 'grad_norm': 219.64930725097656, 'learning_rate': 1.54464609800363e-05, 'epoch': 9.69} +{'loss': 36.488, 'grad_norm': 247.86685180664062, 'learning_rate': 1.5441016333938295e-05, 'epoch': 9.69} +{'loss': 36.2925, 'grad_norm': 248.7967071533203, 'learning_rate': 1.543557168784029e-05, 'epoch': 9.69} +{'loss': 37.3986, 'grad_norm': 243.1404571533203, 'learning_rate': 1.5430127041742285e-05, 'epoch': 9.7} +{'loss': 37.9784, 'grad_norm': 276.6585388183594, 'learning_rate': 1.5424682395644284e-05, 'epoch': 9.7} +{'loss': 38.1591, 'grad_norm': 308.171630859375, 'learning_rate': 1.541923774954628e-05, 'epoch': 9.7} +{'loss': 27.4514, 'grad_norm': 204.4575653076172, 'learning_rate': 1.541379310344828e-05, 'epoch': 9.71} +{'loss': 23.7982, 'grad_norm': 160.85946655273438, 'learning_rate': 1.5408348457350274e-05, 'epoch': 9.71} + 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2700/5520 [2:21:11<2:16:04, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.619924008846283, 'eval_runtime': 3.145, 'eval_samples_per_second': 56.916, 'eval_steps_per_second': 56.916, 'epoch': 9.71} +{'loss': 23.3927, 'grad_norm': 215.60049438476562, 'learning_rate': 1.540290381125227e-05, 'epoch': 9.72} +{'loss': 24.1876, 'grad_norm': 172.84011840820312, 'learning_rate': 1.5397459165154265e-05, 'epoch': 9.72} +{'loss': 25.1794, 'grad_norm': 208.42361450195312, 'learning_rate': 1.539201451905626e-05, 'epoch': 9.72} +{'loss': 42.3484, 'grad_norm': 255.73574829101562, 'learning_rate': 1.538656987295826e-05, 'epoch': 9.73} +{'loss': 42.8277, 'grad_norm': 239.65533447265625, 'learning_rate': 1.5381125226860254e-05, 'epoch': 9.73} +{'loss': 42.6536, 'grad_norm': 211.2068634033203, 'learning_rate': 1.5375680580762253e-05, 'epoch': 9.73} +{'loss': 42.6263, 'grad_norm': 302.85003662109375, 'learning_rate': 1.5370235934664248e-05, 'epoch': 9.74} +{'loss': 41.5621, 'grad_norm': 211.54754638671875, 'learning_rate': 1.5364791288566244e-05, 'epoch': 9.74} +{'loss': 43.3765, 'grad_norm': 229.22283935546875, 'learning_rate': 1.535934664246824e-05, 'epoch': 9.74} +{'loss': 41.4923, 'grad_norm': 206.64794921875, 'learning_rate': 1.5353901996370238e-05, 'epoch': 9.75} + 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2710/5520 [2:21:42<2:15:26, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6202616095542908, 'eval_runtime': 3.1414, 'eval_samples_per_second': 56.981, 'eval_steps_per_second': 56.981, 'epoch': 9.75} +{'loss': 43.1931, 'grad_norm': 216.98757934570312, 'learning_rate': 1.5348457350272233e-05, 'epoch': 9.75} +{'loss': 42.485, 'grad_norm': 222.7340545654297, 'learning_rate': 1.534301270417423e-05, 'epoch': 9.75} +{'loss': 41.4766, 'grad_norm': 291.3454895019531, 'learning_rate': 1.5337568058076224e-05, 'epoch': 9.76} +{'loss': 41.9215, 'grad_norm': 239.50341796875, 'learning_rate': 1.533212341197822e-05, 'epoch': 9.76} +{'loss': 40.6544, 'grad_norm': 179.21839904785156, 'learning_rate': 1.5326678765880218e-05, 'epoch': 9.77} +{'loss': 38.6204, 'grad_norm': 210.89535522460938, 'learning_rate': 1.5321234119782217e-05, 'epoch': 9.77} +{'loss': 39.4385, 'grad_norm': 239.23291015625, 'learning_rate': 1.5315789473684212e-05, 'epoch': 9.77} +{'loss': 40.0139, 'grad_norm': 240.22772216796875, 'learning_rate': 1.5310344827586208e-05, 'epoch': 9.78} +{'loss': 38.9331, 'grad_norm': 185.4588623046875, 'learning_rate': 1.5304900181488203e-05, 'epoch': 9.78} +{'loss': 38.5485, 'grad_norm': 263.0315856933594, 'learning_rate': 1.52994555353902e-05, 'epoch': 9.78} + 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2720/5520 [2:22:14<2:15:18, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.615914523601532, 'eval_runtime': 3.1401, 'eval_samples_per_second': 57.004, 'eval_steps_per_second': 57.004, 'epoch': 9.78} +{'loss': 39.4875, 'grad_norm': 209.05348205566406, 'learning_rate': 1.5294010889292197e-05, 'epoch': 9.79} +{'loss': 40.4742, 'grad_norm': 209.72293090820312, 'learning_rate': 1.5288566243194193e-05, 'epoch': 9.79} +{'loss': 39.924, 'grad_norm': 210.02908325195312, 'learning_rate': 1.5283121597096188e-05, 'epoch': 9.79} +{'loss': 40.8893, 'grad_norm': 204.3467254638672, 'learning_rate': 1.5277676950998183e-05, 'epoch': 9.8} +{'loss': 38.3278, 'grad_norm': 253.9317626953125, 'learning_rate': 1.5272232304900182e-05, 'epoch': 9.8} +{'loss': 40.5242, 'grad_norm': 263.6196594238281, 'learning_rate': 1.526678765880218e-05, 'epoch': 9.81} +{'loss': 40.683, 'grad_norm': 230.35621643066406, 'learning_rate': 1.5261343012704176e-05, 'epoch': 9.81} +{'loss': 40.2472, 'grad_norm': 190.16323852539062, 'learning_rate': 1.5255898366606172e-05, 'epoch': 9.81} +{'loss': 38.9644, 'grad_norm': 202.7122344970703, 'learning_rate': 1.5250453720508167e-05, 'epoch': 9.82} +{'loss': 40.9982, 'grad_norm': 193.65774536132812, 'learning_rate': 1.5245009074410164e-05, 'epoch': 9.82} + 49%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2730/5520 [2:22:46<2:14:48, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6152020692825317, 'eval_runtime': 3.1394, 'eval_samples_per_second': 57.018, 'eval_steps_per_second': 57.018, 'epoch': 9.82} +{'loss': 40.5518, 'grad_norm': 272.0360412597656, 'learning_rate': 1.523956442831216e-05, 'epoch': 9.82} +{'loss': 38.4801, 'grad_norm': 200.20777893066406, 'learning_rate': 1.5234119782214155e-05, 'epoch': 9.83} +{'loss': 35.7499, 'grad_norm': 201.44764709472656, 'learning_rate': 1.5228675136116152e-05, 'epoch': 9.83} +{'loss': 35.4331, 'grad_norm': 234.89706420898438, 'learning_rate': 1.522323049001815e-05, 'epoch': 9.83} +{'loss': 33.0281, 'grad_norm': 193.27423095703125, 'learning_rate': 1.5217785843920146e-05, 'epoch': 9.84} +{'loss': 34.2237, 'grad_norm': 222.28060913085938, 'learning_rate': 1.5212341197822143e-05, 'epoch': 9.84} +{'loss': 33.7112, 'grad_norm': 264.2764587402344, 'learning_rate': 1.5206896551724139e-05, 'epoch': 9.85} +{'loss': 33.9014, 'grad_norm': 204.5146484375, 'learning_rate': 1.5201451905626134e-05, 'epoch': 9.85} +{'loss': 36.6987, 'grad_norm': 198.90907287597656, 'learning_rate': 1.5196007259528131e-05, 'epoch': 9.85} +{'loss': 35.4466, 'grad_norm': 254.19818115234375, 'learning_rate': 1.5190562613430126e-05, 'epoch': 9.86} + 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2740/5520 [2:23:18<2:14:33, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6153284311294556, 'eval_runtime': 3.145, 'eval_samples_per_second': 56.916, 'eval_steps_per_second': 56.916, 'epoch': 9.86} +{'loss': 35.659, 'grad_norm': 212.53749084472656, 'learning_rate': 1.5185117967332123e-05, 'epoch': 9.86} +{'loss': 36.7411, 'grad_norm': 234.5277557373047, 'learning_rate': 1.5179673321234119e-05, 'epoch': 9.86} +{'loss': 36.0713, 'grad_norm': 229.25962829589844, 'learning_rate': 1.5174228675136118e-05, 'epoch': 9.87} +{'loss': 37.2433, 'grad_norm': 259.5096435546875, 'learning_rate': 1.5168784029038115e-05, 'epoch': 9.87} +{'loss': 37.222, 'grad_norm': 297.2413024902344, 'learning_rate': 1.516333938294011e-05, 'epoch': 9.87} +{'loss': 37.096, 'grad_norm': 259.8325500488281, 'learning_rate': 1.5157894736842105e-05, 'epoch': 9.88} +{'loss': 37.769, 'grad_norm': 275.85888671875, 'learning_rate': 1.5152450090744103e-05, 'epoch': 9.88} +{'loss': 38.4089, 'grad_norm': 261.16656494140625, 'learning_rate': 1.5147005444646098e-05, 'epoch': 9.88} +{'loss': 32.5255, 'grad_norm': 219.74351501464844, 'learning_rate': 1.5141560798548095e-05, 'epoch': 9.89} +{'loss': 24.2497, 'grad_norm': 203.9193878173828, 'learning_rate': 1.513611615245009e-05, 'epoch': 9.89} + 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2750/5520 [2:23:49<2:13:42, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6206448674201965, 'eval_runtime': 3.1398, 'eval_samples_per_second': 57.011, 'eval_steps_per_second': 57.011, 'epoch': 9.89} +{'loss': 23.0629, 'grad_norm': 224.19454956054688, 'learning_rate': 1.5130671506352086e-05, 'epoch': 9.9} +{'loss': 24.5799, 'grad_norm': 252.4147186279297, 'learning_rate': 1.5125226860254086e-05, 'epoch': 9.9} +{'loss': 24.6773, 'grad_norm': 214.79067993164062, 'learning_rate': 1.5119782214156082e-05, 'epoch': 9.9} +{'loss': 43.1147, 'grad_norm': 225.59848022460938, 'learning_rate': 1.5114337568058077e-05, 'epoch': 9.91} +{'loss': 42.7403, 'grad_norm': 221.8661651611328, 'learning_rate': 1.5108892921960074e-05, 'epoch': 9.91} +{'loss': 41.6931, 'grad_norm': 316.3871765136719, 'learning_rate': 1.510344827586207e-05, 'epoch': 9.91} +{'loss': 43.3, 'grad_norm': 250.6577911376953, 'learning_rate': 1.5098003629764065e-05, 'epoch': 9.92} +{'loss': 43.3128, 'grad_norm': 222.44386291503906, 'learning_rate': 1.5092558983666062e-05, 'epoch': 9.92} +{'loss': 41.4814, 'grad_norm': 190.08682250976562, 'learning_rate': 1.5087114337568057e-05, 'epoch': 9.92} +{'loss': 41.042, 'grad_norm': 276.9918212890625, 'learning_rate': 1.5081669691470054e-05, 'epoch': 9.93} + 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2760/5520 [2:24:21<2:13:16, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6201648116111755, 'eval_runtime': 3.1444, 'eval_samples_per_second': 56.927, 'eval_steps_per_second': 56.927, 'epoch': 9.93} +{'loss': 40.3064, 'grad_norm': 269.7344970703125, 'learning_rate': 1.507622504537205e-05, 'epoch': 9.93} +{'loss': 40.1675, 'grad_norm': 263.11663818359375, 'learning_rate': 1.5070780399274049e-05, 'epoch': 9.94} +{'loss': 40.5334, 'grad_norm': 210.37635803222656, 'learning_rate': 1.5065335753176046e-05, 'epoch': 9.94} +{'loss': 41.0429, 'grad_norm': 206.09335327148438, 'learning_rate': 1.5059891107078041e-05, 'epoch': 9.94} +{'loss': 40.8831, 'grad_norm': 245.45013427734375, 'learning_rate': 1.5054446460980036e-05, 'epoch': 9.95} +{'loss': 41.2453, 'grad_norm': 216.63075256347656, 'learning_rate': 1.5049001814882033e-05, 'epoch': 9.95} +{'loss': 40.4561, 'grad_norm': 362.12127685546875, 'learning_rate': 1.5043557168784029e-05, 'epoch': 9.95} +{'loss': 41.7307, 'grad_norm': 222.01434326171875, 'learning_rate': 1.5038112522686024e-05, 'epoch': 9.96} +{'loss': 37.83, 'grad_norm': 289.6107177734375, 'learning_rate': 1.5032667876588021e-05, 'epoch': 9.96} +{'loss': 34.1728, 'grad_norm': 231.75274658203125, 'learning_rate': 1.5027223230490017e-05, 'epoch': 9.96} + 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2770/5520 [2:24:53<2:07:58, 2.79s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6177247166633606, 'eval_runtime': 3.141, 'eval_samples_per_second': 56.988, 'eval_steps_per_second': 56.988, 'epoch': 9.96} +{'loss': 33.8501, 'grad_norm': 269.4657287597656, 'learning_rate': 1.5021778584392017e-05, 'epoch': 9.97} +{'loss': 35.0989, 'grad_norm': 229.73004150390625, 'learning_rate': 1.5016333938294013e-05, 'epoch': 9.97} +{'loss': 35.1091, 'grad_norm': 215.75350952148438, 'learning_rate': 1.5010889292196008e-05, 'epoch': 9.98} +{'loss': 36.8373, 'grad_norm': 255.36439514160156, 'learning_rate': 1.5005444646098005e-05, 'epoch': 9.98} +{'loss': 36.6244, 'grad_norm': 226.71084594726562, 'learning_rate': 1.5e-05, 'epoch': 9.98} +{'loss': 36.1925, 'grad_norm': 264.1791076660156, 'learning_rate': 1.4994555353901996e-05, 'epoch': 9.99} +{'loss': 38.5627, 'grad_norm': 281.4349060058594, 'learning_rate': 1.4989110707803993e-05, 'epoch': 9.99} +{'loss': 33.3277, 'grad_norm': 275.13092041015625, 'learning_rate': 1.498366606170599e-05, 'epoch': 9.99} +{'loss': 23.7482, 'grad_norm': 215.79550170898438, 'learning_rate': 1.4978221415607985e-05, 'epoch': 10.0} +{'loss': 21.7078, 'grad_norm': 162.03152465820312, 'learning_rate': 1.4972776769509982e-05, 'epoch': 10.0} + 50%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2780/5520 [2:25:24<2:12:17, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6126651763916016, 'eval_runtime': 3.1408, 'eval_samples_per_second': 56.992, 'eval_steps_per_second': 56.992, 'epoch': 10.0} +{'loss': 42.2449, 'grad_norm': 243.1815185546875, 'learning_rate': 1.4967332123411978e-05, 'epoch': 10.0} +{'loss': 41.5925, 'grad_norm': 183.29127502441406, 'learning_rate': 1.4961887477313977e-05, 'epoch': 10.01} +{'loss': 40.6657, 'grad_norm': 206.04238891601562, 'learning_rate': 1.4956442831215972e-05, 'epoch': 10.01} +{'loss': 41.7065, 'grad_norm': 192.1796875, 'learning_rate': 1.4950998185117967e-05, 'epoch': 10.01} +{'loss': 42.0608, 'grad_norm': 202.77279663085938, 'learning_rate': 1.4945553539019964e-05, 'epoch': 10.02} +{'loss': 40.9925, 'grad_norm': 242.37734985351562, 'learning_rate': 1.494010889292196e-05, 'epoch': 10.02} +{'loss': 41.1401, 'grad_norm': 252.01358032226562, 'learning_rate': 1.4934664246823957e-05, 'epoch': 10.03} +{'loss': 41.5, 'grad_norm': 205.82388305664062, 'learning_rate': 1.4929219600725954e-05, 'epoch': 10.03} +{'loss': 41.8218, 'grad_norm': 251.53968811035156, 'learning_rate': 1.492377495462795e-05, 'epoch': 10.03} +{'loss': 40.803, 'grad_norm': 236.55564880371094, 'learning_rate': 1.4918330308529945e-05, 'epoch': 10.04} + 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2790/5520 [2:25:56<2:11:42, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6173696517944336, 'eval_runtime': 3.1455, 'eval_samples_per_second': 56.906, 'eval_steps_per_second': 56.906, 'epoch': 10.04} +{'loss': 40.522, 'grad_norm': 214.9959716796875, 'learning_rate': 1.4912885662431942e-05, 'epoch': 10.04} +{'loss': 38.8643, 'grad_norm': 213.7000732421875, 'learning_rate': 1.4907441016333939e-05, 'epoch': 10.04} +{'loss': 38.3625, 'grad_norm': 225.6709747314453, 'learning_rate': 1.4901996370235936e-05, 'epoch': 10.05} +{'loss': 38.5355, 'grad_norm': 208.83712768554688, 'learning_rate': 1.4896551724137931e-05, 'epoch': 10.05} +{'loss': 38.4303, 'grad_norm': 185.51219177246094, 'learning_rate': 1.4891107078039927e-05, 'epoch': 10.05} +{'loss': 38.1895, 'grad_norm': 196.68551635742188, 'learning_rate': 1.4885662431941925e-05, 'epoch': 10.06} +{'loss': 39.2329, 'grad_norm': 207.4806671142578, 'learning_rate': 1.488021778584392e-05, 'epoch': 10.06} +{'loss': 40.108, 'grad_norm': 211.640380859375, 'learning_rate': 1.4874773139745916e-05, 'epoch': 10.07} +{'loss': 39.6883, 'grad_norm': 195.97006225585938, 'learning_rate': 1.4869328493647913e-05, 'epoch': 10.07} +{'loss': 40.557, 'grad_norm': 207.20169067382812, 'learning_rate': 1.4863883847549909e-05, 'epoch': 10.07} + 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2800/5520 [2:26:28<2:10:55, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6166439652442932, 'eval_runtime': 3.1461, 'eval_samples_per_second': 56.895, 'eval_steps_per_second': 56.895, 'epoch': 10.07} +{'loss': 39.76, 'grad_norm': 168.4052276611328, 'learning_rate': 1.4858439201451906e-05, 'epoch': 10.08} +{'loss': 40.4776, 'grad_norm': 188.55575561523438, 'learning_rate': 1.4852994555353903e-05, 'epoch': 10.08} +{'loss': 40.5414, 'grad_norm': 181.60801696777344, 'learning_rate': 1.4847549909255898e-05, 'epoch': 10.08} +{'loss': 41.4944, 'grad_norm': 205.39608764648438, 'learning_rate': 1.4842105263157895e-05, 'epoch': 10.09} +{'loss': 40.6805, 'grad_norm': 271.0169372558594, 'learning_rate': 1.4836660617059892e-05, 'epoch': 10.09} +{'loss': 39.5473, 'grad_norm': 241.97889709472656, 'learning_rate': 1.4831215970961888e-05, 'epoch': 10.09} +{'loss': 41.0357, 'grad_norm': 211.64260864257812, 'learning_rate': 1.4825771324863885e-05, 'epoch': 10.1} +{'loss': 41.3357, 'grad_norm': 209.52804565429688, 'learning_rate': 1.482032667876588e-05, 'epoch': 10.1} +{'loss': 38.6778, 'grad_norm': 243.08419799804688, 'learning_rate': 1.4814882032667876e-05, 'epoch': 10.1} +{'loss': 35.1128, 'grad_norm': 227.17172241210938, 'learning_rate': 1.4809437386569874e-05, 'epoch': 10.11} + 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2800/5520 [2:26:31<2:10:55, 2.89s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2810/5520 [2:27:00<2:11:18, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6153741478919983, 'eval_runtime': 3.143, 'eval_samples_per_second': 56.952, 'eval_steps_per_second': 56.952, 'epoch': 10.11} +{'loss': 33.1712, 'grad_norm': 284.7151794433594, 'learning_rate': 1.480399274047187e-05, 'epoch': 10.11} +{'loss': 33.495, 'grad_norm': 234.85169982910156, 'learning_rate': 1.4798548094373867e-05, 'epoch': 10.12} +{'loss': 33.2318, 'grad_norm': 236.6138458251953, 'learning_rate': 1.4793103448275862e-05, 'epoch': 10.12} +{'loss': 33.9268, 'grad_norm': 240.98997497558594, 'learning_rate': 1.4787658802177858e-05, 'epoch': 10.12} +{'loss': 34.667, 'grad_norm': 218.304443359375, 'learning_rate': 1.4782214156079856e-05, 'epoch': 10.13} +{'loss': 36.7153, 'grad_norm': 290.30108642578125, 'learning_rate': 1.4776769509981852e-05, 'epoch': 10.13} +{'loss': 35.2035, 'grad_norm': 267.7265625, 'learning_rate': 1.4771324863883847e-05, 'epoch': 10.13} +{'loss': 35.6581, 'grad_norm': 300.4646301269531, 'learning_rate': 1.4765880217785844e-05, 'epoch': 10.14} +{'loss': 35.8547, 'grad_norm': 234.16448974609375, 'learning_rate': 1.4760435571687841e-05, 'epoch': 10.14} +{'loss': 34.47, 'grad_norm': 209.23858642578125, 'learning_rate': 1.4754990925589837e-05, 'epoch': 10.14} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2820/5520 [2:27:32<2:10:54, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6160662770271301, 'eval_runtime': 3.1408, 'eval_samples_per_second': 56.992, 'eval_steps_per_second': 56.992, 'epoch': 10.14} +{'loss': 36.1239, 'grad_norm': 207.9628143310547, 'learning_rate': 1.4749546279491834e-05, 'epoch': 10.15} +{'loss': 36.759, 'grad_norm': 183.68545532226562, 'learning_rate': 1.4744101633393829e-05, 'epoch': 10.15} +{'loss': 37.397, 'grad_norm': 222.00164794921875, 'learning_rate': 1.4738656987295826e-05, 'epoch': 10.16} +{'loss': 36.3648, 'grad_norm': 226.9628448486328, 'learning_rate': 1.4733212341197823e-05, 'epoch': 10.16} +{'loss': 37.8754, 'grad_norm': 271.061279296875, 'learning_rate': 1.4727767695099819e-05, 'epoch': 10.16} +{'loss': 33.7491, 'grad_norm': 265.2478942871094, 'learning_rate': 1.4722323049001816e-05, 'epoch': 10.17} +{'loss': 23.0162, 'grad_norm': 227.5030975341797, 'learning_rate': 1.4716878402903811e-05, 'epoch': 10.17} +{'loss': 23.5831, 'grad_norm': 195.83477783203125, 'learning_rate': 1.4711433756805808e-05, 'epoch': 10.17} +{'loss': 24.1078, 'grad_norm': 196.982421875, 'learning_rate': 1.4705989110707805e-05, 'epoch': 10.18} +{'loss': 24.8378, 'grad_norm': 212.73031616210938, 'learning_rate': 1.47005444646098e-05, 'epoch': 10.18} + 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 2830/5520 [2:28:04<2:10:20, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6217848062515259, 'eval_runtime': 3.1445, 'eval_samples_per_second': 56.925, 'eval_steps_per_second': 56.925, 'epoch': 10.18} +{'loss': 43.3402, 'grad_norm': 261.8343200683594, 'learning_rate': 1.4695099818511796e-05, 'epoch': 10.18} +{'loss': 42.8004, 'grad_norm': 272.94158935546875, 'learning_rate': 1.4689655172413793e-05, 'epoch': 10.19} +{'loss': 43.5947, 'grad_norm': 261.5067138671875, 'learning_rate': 1.468421052631579e-05, 'epoch': 10.19} +{'loss': 42.1887, 'grad_norm': 280.4205322265625, 'learning_rate': 1.4678765880217787e-05, 'epoch': 10.2} +{'loss': 40.9825, 'grad_norm': 223.82449340820312, 'learning_rate': 1.4673321234119783e-05, 'epoch': 10.2} +{'loss': 41.8347, 'grad_norm': 261.1077575683594, 'learning_rate': 1.4667876588021778e-05, 'epoch': 10.2} +{'loss': 41.7441, 'grad_norm': 189.1642608642578, 'learning_rate': 1.4662431941923775e-05, 'epoch': 10.21} +{'loss': 42.203, 'grad_norm': 216.94410705566406, 'learning_rate': 1.4656987295825772e-05, 'epoch': 10.21} +{'loss': 41.8887, 'grad_norm': 260.44744873046875, 'learning_rate': 1.4651542649727768e-05, 'epoch': 10.21} +{'loss': 42.5977, 'grad_norm': 252.21682739257812, 'learning_rate': 1.4646098003629765e-05, 'epoch': 10.22} + 51%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2840/5520 [2:28:36<2:09:02, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6175437569618225, 'eval_runtime': 3.1405, 'eval_samples_per_second': 56.997, 'eval_steps_per_second': 56.997, 'epoch': 10.22} +{'loss': 40.7994, 'grad_norm': 298.4760437011719, 'learning_rate': 1.464065335753176e-05, 'epoch': 10.22} +{'loss': 39.1571, 'grad_norm': 214.0433349609375, 'learning_rate': 1.4635208711433757e-05, 'epoch': 10.22} +{'loss': 38.257, 'grad_norm': 220.59039306640625, 'learning_rate': 1.4629764065335754e-05, 'epoch': 10.23} +{'loss': 38.1954, 'grad_norm': 218.2419891357422, 'learning_rate': 1.462431941923775e-05, 'epoch': 10.23} +{'loss': 39.7451, 'grad_norm': 241.67674255371094, 'learning_rate': 1.4618874773139747e-05, 'epoch': 10.23} +{'loss': 38.8297, 'grad_norm': 260.3656005859375, 'learning_rate': 1.4613430127041742e-05, 'epoch': 10.24} +{'loss': 38.523, 'grad_norm': 231.78102111816406, 'learning_rate': 1.4607985480943739e-05, 'epoch': 10.24} +{'loss': 40.0389, 'grad_norm': 217.64820861816406, 'learning_rate': 1.4602540834845736e-05, 'epoch': 10.25} +{'loss': 40.3306, 'grad_norm': 186.45240783691406, 'learning_rate': 1.4597096188747732e-05, 'epoch': 10.25} +{'loss': 39.0968, 'grad_norm': 225.20480346679688, 'learning_rate': 1.4591651542649727e-05, 'epoch': 10.25} + 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2850/5520 [2:29:08<2:10:31, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6195141673088074, 'eval_runtime': 3.1422, 'eval_samples_per_second': 56.967, 'eval_steps_per_second': 56.967, 'epoch': 10.25} +{'loss': 38.869, 'grad_norm': 367.6174621582031, 'learning_rate': 1.4586206896551724e-05, 'epoch': 10.26} +{'loss': 39.7781, 'grad_norm': 274.3976135253906, 'learning_rate': 1.4580762250453721e-05, 'epoch': 10.26} +{'loss': 38.819, 'grad_norm': 193.41665649414062, 'learning_rate': 1.4575317604355718e-05, 'epoch': 10.26} +{'loss': 41.5495, 'grad_norm': 204.2224578857422, 'learning_rate': 1.4569872958257714e-05, 'epoch': 10.27} +{'loss': 40.6553, 'grad_norm': 276.07476806640625, 'learning_rate': 1.4564428312159709e-05, 'epoch': 10.27} +{'loss': 40.2147, 'grad_norm': 192.6361541748047, 'learning_rate': 1.4558983666061708e-05, 'epoch': 10.27} +{'loss': 40.7223, 'grad_norm': 232.6641082763672, 'learning_rate': 1.4553539019963703e-05, 'epoch': 10.28} +{'loss': 38.0127, 'grad_norm': 266.781005859375, 'learning_rate': 1.4548094373865698e-05, 'epoch': 10.28} +{'loss': 35.216, 'grad_norm': 289.5414123535156, 'learning_rate': 1.4542649727767696e-05, 'epoch': 10.29} +{'loss': 33.829, 'grad_norm': 208.10845947265625, 'learning_rate': 1.4537205081669691e-05, 'epoch': 10.29} + 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2860/5520 [2:29:40<2:11:36, 2.97s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6140356063842773, 'eval_runtime': 3.1568, 'eval_samples_per_second': 56.703, 'eval_steps_per_second': 56.703, 'epoch': 10.29} +{'loss': 33.8409, 'grad_norm': 260.80328369140625, 'learning_rate': 1.4531760435571688e-05, 'epoch': 10.29} +{'loss': 32.6498, 'grad_norm': 202.3874053955078, 'learning_rate': 1.4526315789473685e-05, 'epoch': 10.3} +{'loss': 33.6538, 'grad_norm': 236.0218048095703, 'learning_rate': 1.452087114337568e-05, 'epoch': 10.3} +{'loss': 33.7346, 'grad_norm': 219.1603240966797, 'learning_rate': 1.4515426497277678e-05, 'epoch': 10.3} +{'loss': 34.6996, 'grad_norm': 252.8759307861328, 'learning_rate': 1.4509981851179675e-05, 'epoch': 10.31} +{'loss': 36.1145, 'grad_norm': 204.89244079589844, 'learning_rate': 1.450453720508167e-05, 'epoch': 10.31} +{'loss': 34.8845, 'grad_norm': 239.5278778076172, 'learning_rate': 1.4499092558983667e-05, 'epoch': 10.31} +{'loss': 36.1006, 'grad_norm': 235.02403259277344, 'learning_rate': 1.4493647912885662e-05, 'epoch': 10.32} +{'loss': 37.0463, 'grad_norm': 219.25686645507812, 'learning_rate': 1.4488203266787658e-05, 'epoch': 10.32} +{'loss': 35.5543, 'grad_norm': 238.1767578125, 'learning_rate': 1.4482758620689657e-05, 'epoch': 10.33} + 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2870/5520 [2:30:12<2:09:46, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6116110682487488, 'eval_runtime': 3.1442, 'eval_samples_per_second': 56.93, 'eval_steps_per_second': 56.93, 'epoch': 10.33} +{'loss': 35.7557, 'grad_norm': 245.4133758544922, 'learning_rate': 1.4477313974591652e-05, 'epoch': 10.33} +{'loss': 35.9535, 'grad_norm': 231.70779418945312, 'learning_rate': 1.4471869328493647e-05, 'epoch': 10.33} +{'loss': 36.747, 'grad_norm': 218.71266174316406, 'learning_rate': 1.4466424682395644e-05, 'epoch': 10.34} +{'loss': 37.4007, 'grad_norm': 206.82247924804688, 'learning_rate': 1.446098003629764e-05, 'epoch': 10.34} +{'loss': 38.183, 'grad_norm': 286.6649475097656, 'learning_rate': 1.4455535390199639e-05, 'epoch': 10.34} +{'loss': 28.1564, 'grad_norm': 262.2049865722656, 'learning_rate': 1.4450090744101634e-05, 'epoch': 10.35} +{'loss': 23.7155, 'grad_norm': 203.03831481933594, 'learning_rate': 1.444464609800363e-05, 'epoch': 10.35} +{'loss': 23.5066, 'grad_norm': 220.13597106933594, 'learning_rate': 1.4439201451905626e-05, 'epoch': 10.35} +{'loss': 23.8087, 'grad_norm': 208.22035217285156, 'learning_rate': 1.4433756805807624e-05, 'epoch': 10.36} +{'loss': 24.6194, 'grad_norm': 202.74989318847656, 'learning_rate': 1.4428312159709619e-05, 'epoch': 10.36} + 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2880/5520 [2:30:44<2:07:28, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6170971989631653, 'eval_runtime': 3.1366, 'eval_samples_per_second': 57.068, 'eval_steps_per_second': 57.068, 'epoch': 10.36} +{'loss': 41.1333, 'grad_norm': 251.78924560546875, 'learning_rate': 1.4422867513611616e-05, 'epoch': 10.36} +{'loss': 43.5289, 'grad_norm': 269.72430419921875, 'learning_rate': 1.4417422867513611e-05, 'epoch': 10.37} +{'loss': 42.1575, 'grad_norm': 226.14202880859375, 'learning_rate': 1.4411978221415607e-05, 'epoch': 10.37} +{'loss': 42.5563, 'grad_norm': 230.2255096435547, 'learning_rate': 1.4406533575317606e-05, 'epoch': 10.38} +{'loss': 41.517, 'grad_norm': 259.2338562011719, 'learning_rate': 1.4401088929219601e-05, 'epoch': 10.38} +{'loss': 41.3589, 'grad_norm': 280.06414794921875, 'learning_rate': 1.4395644283121598e-05, 'epoch': 10.38} +{'loss': 41.539, 'grad_norm': 259.1960754394531, 'learning_rate': 1.4390199637023593e-05, 'epoch': 10.39} +{'loss': 41.8689, 'grad_norm': 244.4931640625, 'learning_rate': 1.438475499092559e-05, 'epoch': 10.39} +{'loss': 42.9191, 'grad_norm': 195.65065002441406, 'learning_rate': 1.4379310344827588e-05, 'epoch': 10.39} +{'loss': 41.4172, 'grad_norm': 215.88589477539062, 'learning_rate': 1.4373865698729583e-05, 'epoch': 10.4} + 52%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2890/5520 [2:31:16<2:07:22, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6176813840866089, 'eval_runtime': 3.1462, 'eval_samples_per_second': 56.893, 'eval_steps_per_second': 56.893, 'epoch': 10.4} +{'loss': 41.8998, 'grad_norm': 175.21368408203125, 'learning_rate': 1.4368421052631578e-05, 'epoch': 10.4} +{'loss': 40.33, 'grad_norm': 207.65963745117188, 'learning_rate': 1.4362976406533575e-05, 'epoch': 10.4} +{'loss': 38.0329, 'grad_norm': 213.50526428222656, 'learning_rate': 1.4357531760435572e-05, 'epoch': 10.41} +{'loss': 39.0142, 'grad_norm': 190.8444366455078, 'learning_rate': 1.4352087114337568e-05, 'epoch': 10.41} +{'loss': 38.6364, 'grad_norm': 300.2298583984375, 'learning_rate': 1.4346642468239565e-05, 'epoch': 10.42} +{'loss': 39.6747, 'grad_norm': 183.6144256591797, 'learning_rate': 1.434119782214156e-05, 'epoch': 10.42} +{'loss': 38.3018, 'grad_norm': 237.85340881347656, 'learning_rate': 1.4335753176043557e-05, 'epoch': 10.42} +{'loss': 40.1042, 'grad_norm': 325.96624755859375, 'learning_rate': 1.4330308529945554e-05, 'epoch': 10.43} +{'loss': 40.0357, 'grad_norm': 248.4732666015625, 'learning_rate': 1.432486388384755e-05, 'epoch': 10.43} +{'loss': 40.4383, 'grad_norm': 374.6653747558594, 'learning_rate': 1.4319419237749547e-05, 'epoch': 10.43} + 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2900/5520 [2:31:48<2:08:48, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6150367856025696, 'eval_runtime': 3.1469, 'eval_samples_per_second': 56.881, 'eval_steps_per_second': 56.881, 'epoch': 10.43} +{'loss': 40.3728, 'grad_norm': 229.79647827148438, 'learning_rate': 1.4313974591651542e-05, 'epoch': 10.44} +{'loss': 39.546, 'grad_norm': 278.7500915527344, 'learning_rate': 1.430852994555354e-05, 'epoch': 10.44} +{'loss': 41.8094, 'grad_norm': 233.1890106201172, 'learning_rate': 1.4303085299455536e-05, 'epoch': 10.44} +{'loss': 40.6225, 'grad_norm': 207.7745819091797, 'learning_rate': 1.4297640653357532e-05, 'epoch': 10.45} +{'loss': 40.2499, 'grad_norm': 233.37892150878906, 'learning_rate': 1.4292196007259529e-05, 'epoch': 10.45} +{'loss': 40.3626, 'grad_norm': 225.4070587158203, 'learning_rate': 1.4286751361161524e-05, 'epoch': 10.46} +{'loss': 40.3149, 'grad_norm': 239.60231018066406, 'learning_rate': 1.4281306715063521e-05, 'epoch': 10.46} +{'loss': 39.3443, 'grad_norm': 225.3981475830078, 'learning_rate': 1.4275862068965518e-05, 'epoch': 10.46} +{'loss': 37.8947, 'grad_norm': 270.2829284667969, 'learning_rate': 1.4270417422867514e-05, 'epoch': 10.47} +{'loss': 34.4721, 'grad_norm': 263.66986083984375, 'learning_rate': 1.426497277676951e-05, 'epoch': 10.47} + 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2910/5520 [2:32:20<2:09:28, 2.98s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6134031414985657, 'eval_runtime': 3.1413, 'eval_samples_per_second': 56.984, 'eval_steps_per_second': 56.984, 'epoch': 10.47} +{'loss': 34.3148, 'grad_norm': 189.3812255859375, 'learning_rate': 1.4259528130671508e-05, 'epoch': 10.47} +{'loss': 32.1693, 'grad_norm': 256.7174987792969, 'learning_rate': 1.4254083484573503e-05, 'epoch': 10.48} +{'loss': 34.369, 'grad_norm': 265.40692138671875, 'learning_rate': 1.4248638838475499e-05, 'epoch': 10.48} +{'loss': 34.9479, 'grad_norm': 315.6539001464844, 'learning_rate': 1.4243194192377496e-05, 'epoch': 10.48} +{'loss': 33.983, 'grad_norm': 263.7816162109375, 'learning_rate': 1.4237749546279491e-05, 'epoch': 10.49} +{'loss': 36.6685, 'grad_norm': 244.69192504882812, 'learning_rate': 1.423230490018149e-05, 'epoch': 10.49} +{'loss': 35.0337, 'grad_norm': 224.26071166992188, 'learning_rate': 1.4226860254083485e-05, 'epoch': 10.49} +{'loss': 34.7154, 'grad_norm': 261.0958557128906, 'learning_rate': 1.422141560798548e-05, 'epoch': 10.5} +{'loss': 35.4156, 'grad_norm': 245.85960388183594, 'learning_rate': 1.4215970961887478e-05, 'epoch': 10.5} +{'loss': 36.3999, 'grad_norm': 309.3730163574219, 'learning_rate': 1.4210526315789473e-05, 'epoch': 10.51} + 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2920/5520 [2:32:53<2:10:30, 3.01s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6144266128540039, 'eval_runtime': 3.1485, 'eval_samples_per_second': 56.853, 'eval_steps_per_second': 56.853, 'epoch': 10.51} +{'loss': 37.1515, 'grad_norm': 209.9637451171875, 'learning_rate': 1.420508166969147e-05, 'epoch': 10.51} +{'loss': 35.5548, 'grad_norm': 254.81683349609375, 'learning_rate': 1.4199637023593467e-05, 'epoch': 10.51} +{'loss': 36.7691, 'grad_norm': 224.94137573242188, 'learning_rate': 1.4194192377495463e-05, 'epoch': 10.52} +{'loss': 37.5904, 'grad_norm': 223.81838989257812, 'learning_rate': 1.4188747731397458e-05, 'epoch': 10.52} +{'loss': 36.1561, 'grad_norm': 308.0168151855469, 'learning_rate': 1.4183303085299457e-05, 'epoch': 10.52} +{'loss': 27.6309, 'grad_norm': 214.77928161621094, 'learning_rate': 1.4177858439201452e-05, 'epoch': 10.53} +{'loss': 23.6151, 'grad_norm': 153.77163696289062, 'learning_rate': 1.417241379310345e-05, 'epoch': 10.53} +{'loss': 23.1684, 'grad_norm': 161.12826538085938, 'learning_rate': 1.4166969147005445e-05, 'epoch': 10.53} +{'loss': 23.4383, 'grad_norm': 228.01441955566406, 'learning_rate': 1.416152450090744e-05, 'epoch': 10.54} +{'loss': 25.4699, 'grad_norm': 207.55052185058594, 'learning_rate': 1.4156079854809439e-05, 'epoch': 10.54} + 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 2930/5520 [2:33:26<2:09:38, 3.00s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6177500486373901, 'eval_runtime': 3.1369, 'eval_samples_per_second': 57.063, 'eval_steps_per_second': 57.063, 'epoch': 10.54} +{'loss': 42.1525, 'grad_norm': 254.23828125, 'learning_rate': 1.4150635208711434e-05, 'epoch': 10.55} +{'loss': 42.4282, 'grad_norm': 228.1654815673828, 'learning_rate': 1.414519056261343e-05, 'epoch': 10.55} +{'loss': 42.3053, 'grad_norm': 258.4981689453125, 'learning_rate': 1.4139745916515427e-05, 'epoch': 10.55} +{'loss': 41.9009, 'grad_norm': 364.42059326171875, 'learning_rate': 1.4134301270417424e-05, 'epoch': 10.56} +{'loss': 41.0624, 'grad_norm': 213.5066375732422, 'learning_rate': 1.412885662431942e-05, 'epoch': 10.56} +{'loss': 42.2508, 'grad_norm': 214.23472595214844, 'learning_rate': 1.4123411978221416e-05, 'epoch': 10.56} +{'loss': 43.0671, 'grad_norm': 249.8063201904297, 'learning_rate': 1.4117967332123412e-05, 'epoch': 10.57} +{'loss': 43.4018, 'grad_norm': 210.0769805908203, 'learning_rate': 1.4112522686025409e-05, 'epoch': 10.57} +{'loss': 42.9609, 'grad_norm': 255.67225646972656, 'learning_rate': 1.4107078039927406e-05, 'epoch': 10.57} +{'loss': 41.8748, 'grad_norm': 294.2599182128906, 'learning_rate': 1.4101633393829401e-05, 'epoch': 10.58} + 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2940/5520 [2:33:59<2:08:53, 3.00s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6147512793540955, 'eval_runtime': 3.142, 'eval_samples_per_second': 56.969, 'eval_steps_per_second': 56.969, 'epoch': 10.58} +{'loss': 42.4291, 'grad_norm': 212.6685333251953, 'learning_rate': 1.4096188747731398e-05, 'epoch': 10.58} +{'loss': 39.7291, 'grad_norm': 297.016357421875, 'learning_rate': 1.4090744101633394e-05, 'epoch': 10.59} +{'loss': 37.4836, 'grad_norm': 280.308837890625, 'learning_rate': 1.4085299455535389e-05, 'epoch': 10.59} +{'loss': 39.4075, 'grad_norm': 230.28994750976562, 'learning_rate': 1.4079854809437388e-05, 'epoch': 10.59} +{'loss': 40.5601, 'grad_norm': 377.0367126464844, 'learning_rate': 1.4074410163339383e-05, 'epoch': 10.6} +{'loss': 38.1238, 'grad_norm': 238.51597595214844, 'learning_rate': 1.406896551724138e-05, 'epoch': 10.6} +{'loss': 38.2997, 'grad_norm': 197.5536651611328, 'learning_rate': 1.4063520871143376e-05, 'epoch': 10.6} +{'loss': 39.1501, 'grad_norm': 211.65162658691406, 'learning_rate': 1.4058076225045373e-05, 'epoch': 10.61} +{'loss': 40.5761, 'grad_norm': 266.4801940917969, 'learning_rate': 1.405263157894737e-05, 'epoch': 10.61} +{'loss': 39.7387, 'grad_norm': 210.29478454589844, 'learning_rate': 1.4047186932849365e-05, 'epoch': 10.61} + 53%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2950/5520 [2:34:32<2:09:15, 3.02s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6154477000236511, 'eval_runtime': 3.1384, 'eval_samples_per_second': 57.035, 'eval_steps_per_second': 57.035, 'epoch': 10.61} +{'loss': 38.691, 'grad_norm': 318.0694580078125, 'learning_rate': 1.404174228675136e-05, 'epoch': 10.62} +{'loss': 40.3878, 'grad_norm': 351.12811279296875, 'learning_rate': 1.4036297640653358e-05, 'epoch': 10.62} +{'loss': 38.4447, 'grad_norm': 259.8601989746094, 'learning_rate': 1.4030852994555355e-05, 'epoch': 10.62} +{'loss': 41.1242, 'grad_norm': 249.7741241455078, 'learning_rate': 1.402540834845735e-05, 'epoch': 10.63} +{'loss': 40.1977, 'grad_norm': 207.11119079589844, 'learning_rate': 1.4019963702359347e-05, 'epoch': 10.63} +{'loss': 40.71, 'grad_norm': 199.37295532226562, 'learning_rate': 1.4014519056261343e-05, 'epoch': 10.64} +{'loss': 41.8822, 'grad_norm': 238.85061645507812, 'learning_rate': 1.4009074410163341e-05, 'epoch': 10.64} +{'loss': 40.5648, 'grad_norm': 212.46388244628906, 'learning_rate': 1.4003629764065337e-05, 'epoch': 10.64} +{'loss': 39.6074, 'grad_norm': 217.60386657714844, 'learning_rate': 1.3998185117967332e-05, 'epoch': 10.65} +{'loss': 37.7394, 'grad_norm': 223.88645935058594, 'learning_rate': 1.399274047186933e-05, 'epoch': 10.65} + 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 2960/5520 [2:35:05<2:08:36, 3.01s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6133999228477478, 'eval_runtime': 3.1372, 'eval_samples_per_second': 57.057, 'eval_steps_per_second': 57.057, 'epoch': 10.65} +{'loss': 34.911, 'grad_norm': 248.87986755371094, 'learning_rate': 1.3987295825771325e-05, 'epoch': 10.65} +{'loss': 34.0325, 'grad_norm': 238.0355987548828, 'learning_rate': 1.3981851179673322e-05, 'epoch': 10.66} +{'loss': 34.9663, 'grad_norm': 212.9556121826172, 'learning_rate': 1.3976406533575319e-05, 'epoch': 10.66} +{'loss': 34.2399, 'grad_norm': 274.4277648925781, 'learning_rate': 1.3970961887477314e-05, 'epoch': 10.66} +{'loss': 33.7609, 'grad_norm': 211.77976989746094, 'learning_rate': 1.396551724137931e-05, 'epoch': 10.67} +{'loss': 35.2616, 'grad_norm': 280.6621398925781, 'learning_rate': 1.3960072595281307e-05, 'epoch': 10.67} +{'loss': 34.2542, 'grad_norm': 239.06439208984375, 'learning_rate': 1.3954627949183304e-05, 'epoch': 10.68} +{'loss': 36.0551, 'grad_norm': 271.45806884765625, 'learning_rate': 1.39491833030853e-05, 'epoch': 10.68} +{'loss': 36.9935, 'grad_norm': 247.76486206054688, 'learning_rate': 1.3943738656987296e-05, 'epoch': 10.68} +{'loss': 36.7769, 'grad_norm': 259.47930908203125, 'learning_rate': 1.3938294010889292e-05, 'epoch': 10.69} + 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 2970/5520 [2:35:37<2:03:37, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6107803583145142, 'eval_runtime': 3.1328, 'eval_samples_per_second': 57.138, 'eval_steps_per_second': 57.138, 'epoch': 10.69} +{'loss': 35.4848, 'grad_norm': 247.50103759765625, 'learning_rate': 1.393284936479129e-05, 'epoch': 10.69} +{'loss': 36.3881, 'grad_norm': 242.37330627441406, 'learning_rate': 1.3927404718693286e-05, 'epoch': 10.69} +{'loss': 37.2684, 'grad_norm': 200.2835693359375, 'learning_rate': 1.3921960072595281e-05, 'epoch': 10.7} +{'loss': 37.4581, 'grad_norm': 261.6256103515625, 'learning_rate': 1.3916515426497278e-05, 'epoch': 10.7} +{'loss': 35.8237, 'grad_norm': 243.7251434326172, 'learning_rate': 1.3911070780399274e-05, 'epoch': 10.7} +{'loss': 29.5815, 'grad_norm': 172.99339294433594, 'learning_rate': 1.390562613430127e-05, 'epoch': 10.71} +{'loss': 23.6597, 'grad_norm': 168.88490295410156, 'learning_rate': 1.3900181488203268e-05, 'epoch': 10.71} +{'loss': 22.5034, 'grad_norm': 213.0456085205078, 'learning_rate': 1.3894736842105263e-05, 'epoch': 10.72} +{'loss': 24.1696, 'grad_norm': 183.87222290039062, 'learning_rate': 1.388929219600726e-05, 'epoch': 10.72} +{'loss': 24.8905, 'grad_norm': 179.4297637939453, 'learning_rate': 1.3883847549909256e-05, 'epoch': 10.72} + 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 2980/5520 [2:36:09<2:03:15, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6176853179931641, 'eval_runtime': 3.1438, 'eval_samples_per_second': 56.938, 'eval_steps_per_second': 56.938, 'epoch': 10.72} +{'loss': 40.6941, 'grad_norm': 214.10662841796875, 'learning_rate': 1.3878402903811253e-05, 'epoch': 10.73} +{'loss': 42.6363, 'grad_norm': 199.4381103515625, 'learning_rate': 1.387295825771325e-05, 'epoch': 10.73} +{'loss': 40.9695, 'grad_norm': 182.74517822265625, 'learning_rate': 1.3867513611615245e-05, 'epoch': 10.73} +{'loss': 40.8893, 'grad_norm': 182.41421508789062, 'learning_rate': 1.386206896551724e-05, 'epoch': 10.74} +{'loss': 40.6667, 'grad_norm': 215.42904663085938, 'learning_rate': 1.385662431941924e-05, 'epoch': 10.74} +{'loss': 42.0714, 'grad_norm': 208.15133666992188, 'learning_rate': 1.3851179673321235e-05, 'epoch': 10.74} +{'loss': 40.9404, 'grad_norm': 224.70242309570312, 'learning_rate': 1.384573502722323e-05, 'epoch': 10.75} +{'loss': 43.5597, 'grad_norm': 241.45301818847656, 'learning_rate': 1.3840290381125227e-05, 'epoch': 10.75} +{'loss': 42.7741, 'grad_norm': 201.2677459716797, 'learning_rate': 1.3834845735027222e-05, 'epoch': 10.75} +{'loss': 41.7873, 'grad_norm': 246.30873107910156, 'learning_rate': 1.3829401088929221e-05, 'epoch': 10.76} + 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 2990/5520 [2:36:41<2:05:40, 2.98s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6206657886505127, 'eval_runtime': 3.1469, 'eval_samples_per_second': 56.882, 'eval_steps_per_second': 56.882, 'epoch': 10.76} +{'loss': 42.3601, 'grad_norm': 206.91009521484375, 'learning_rate': 1.3823956442831217e-05, 'epoch': 10.76} +{'loss': 38.5536, 'grad_norm': 206.37472534179688, 'learning_rate': 1.3818511796733212e-05, 'epoch': 10.77} +{'loss': 38.1051, 'grad_norm': 206.49070739746094, 'learning_rate': 1.3813067150635209e-05, 'epoch': 10.77} +{'loss': 39.0797, 'grad_norm': 215.02455139160156, 'learning_rate': 1.3807622504537206e-05, 'epoch': 10.77} +{'loss': 39.419, 'grad_norm': 254.23757934570312, 'learning_rate': 1.3802177858439202e-05, 'epoch': 10.78} +{'loss': 39.2075, 'grad_norm': 205.85079956054688, 'learning_rate': 1.3796733212341199e-05, 'epoch': 10.78} +{'loss': 38.5652, 'grad_norm': 216.0372314453125, 'learning_rate': 1.3791288566243194e-05, 'epoch': 10.78} +{'loss': 38.1968, 'grad_norm': 258.47650146484375, 'learning_rate': 1.3785843920145191e-05, 'epoch': 10.79} +{'loss': 40.2233, 'grad_norm': 289.07354736328125, 'learning_rate': 1.3780399274047188e-05, 'epoch': 10.79} +{'loss': 39.5959, 'grad_norm': 332.9964904785156, 'learning_rate': 1.3774954627949184e-05, 'epoch': 10.79} + 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3000/5520 [2:37:14<2:06:24, 3.01s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6167517304420471, 'eval_runtime': 3.1556, 'eval_samples_per_second': 56.724, 'eval_steps_per_second': 56.724, 'epoch': 10.79} +{'loss': 40.2468, 'grad_norm': 205.10699462890625, 'learning_rate': 1.376950998185118e-05, 'epoch': 10.8} +{'loss': 37.5956, 'grad_norm': 270.2808837890625, 'learning_rate': 1.3764065335753176e-05, 'epoch': 10.8} +{'loss': 38.7289, 'grad_norm': 199.32044982910156, 'learning_rate': 1.3758620689655171e-05, 'epoch': 10.81} +{'loss': 40.6707, 'grad_norm': 196.97547912597656, 'learning_rate': 1.375317604355717e-05, 'epoch': 10.81} +{'loss': 39.6782, 'grad_norm': 219.34588623046875, 'learning_rate': 1.3747731397459166e-05, 'epoch': 10.81} +{'loss': 41.1828, 'grad_norm': 261.7323913574219, 'learning_rate': 1.3742286751361161e-05, 'epoch': 10.82} +{'loss': 41.3582, 'grad_norm': 250.89186096191406, 'learning_rate': 1.3736842105263158e-05, 'epoch': 10.82} +{'loss': 39.3584, 'grad_norm': 284.7223205566406, 'learning_rate': 1.3731397459165155e-05, 'epoch': 10.82} +{'loss': 37.5373, 'grad_norm': 212.9114990234375, 'learning_rate': 1.3725952813067152e-05, 'epoch': 10.83} +{'loss': 35.2027, 'grad_norm': 182.8346405029297, 'learning_rate': 1.3720508166969148e-05, 'epoch': 10.83} + 54%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3000/5520 [2:37:17<2:06:24, 3.01s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3010/5520 [2:37:48<2:06:45, 3.03s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6083630919456482, 'eval_runtime': 3.1568, 'eval_samples_per_second': 56.702, 'eval_steps_per_second': 56.702, 'epoch': 10.83} +{'loss': 33.4937, 'grad_norm': 259.0496520996094, 'learning_rate': 1.3715063520871143e-05, 'epoch': 10.83} +{'loss': 32.8549, 'grad_norm': 173.037353515625, 'learning_rate': 1.370961887477314e-05, 'epoch': 10.84} +{'loss': 33.9163, 'grad_norm': 257.9381408691406, 'learning_rate': 1.3704174228675137e-05, 'epoch': 10.84} +{'loss': 34.3948, 'grad_norm': 248.58355712890625, 'learning_rate': 1.3698729582577132e-05, 'epoch': 10.85} +{'loss': 34.2868, 'grad_norm': 277.0877990722656, 'learning_rate': 1.369328493647913e-05, 'epoch': 10.85} +{'loss': 35.2502, 'grad_norm': 220.54014587402344, 'learning_rate': 1.3687840290381125e-05, 'epoch': 10.85} +{'loss': 33.4599, 'grad_norm': 248.14111328125, 'learning_rate': 1.3682395644283122e-05, 'epoch': 10.86} +{'loss': 34.2927, 'grad_norm': 284.2827453613281, 'learning_rate': 1.3676950998185119e-05, 'epoch': 10.86} +{'loss': 34.9322, 'grad_norm': 236.78201293945312, 'learning_rate': 1.3671506352087114e-05, 'epoch': 10.86} +{'loss': 35.7628, 'grad_norm': 245.58331298828125, 'learning_rate': 1.3666061705989112e-05, 'epoch': 10.87} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3020/5520 [2:38:21<2:03:22, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6125946640968323, 'eval_runtime': 3.1644, 'eval_samples_per_second': 56.566, 'eval_steps_per_second': 56.566, 'epoch': 10.87} +{'loss': 35.7332, 'grad_norm': 217.79248046875, 'learning_rate': 1.3660617059891107e-05, 'epoch': 10.87} +{'loss': 38.293, 'grad_norm': 258.78729248046875, 'learning_rate': 1.3655172413793104e-05, 'epoch': 10.87} +{'loss': 37.511, 'grad_norm': 253.94757080078125, 'learning_rate': 1.3649727767695101e-05, 'epoch': 10.88} +{'loss': 37.5786, 'grad_norm': 265.5654602050781, 'learning_rate': 1.3644283121597096e-05, 'epoch': 10.88} +{'loss': 37.1039, 'grad_norm': 252.11453247070312, 'learning_rate': 1.3638838475499092e-05, 'epoch': 10.88} +{'loss': 35.2651, 'grad_norm': 259.5934753417969, 'learning_rate': 1.3633393829401089e-05, 'epoch': 10.89} +{'loss': 23.7438, 'grad_norm': 194.3569793701172, 'learning_rate': 1.3627949183303086e-05, 'epoch': 10.89} +{'loss': 23.0061, 'grad_norm': 233.95205688476562, 'learning_rate': 1.3622504537205081e-05, 'epoch': 10.9} +{'loss': 24.5404, 'grad_norm': 185.18495178222656, 'learning_rate': 1.3617059891107078e-05, 'epoch': 10.9} +{'loss': 24.3629, 'grad_norm': 200.27029418945312, 'learning_rate': 1.3611615245009074e-05, 'epoch': 10.9} + 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3030/5520 [2:38:53<2:01:50, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6178797483444214, 'eval_runtime': 3.1498, 'eval_samples_per_second': 56.829, 'eval_steps_per_second': 56.829, 'epoch': 10.9} +{'loss': 41.7249, 'grad_norm': 226.4281463623047, 'learning_rate': 1.3606170598911073e-05, 'epoch': 10.91} +{'loss': 42.1902, 'grad_norm': 207.73768615722656, 'learning_rate': 1.3600725952813068e-05, 'epoch': 10.91} +{'loss': 40.8419, 'grad_norm': 248.69773864746094, 'learning_rate': 1.3595281306715063e-05, 'epoch': 10.91} +{'loss': 41.483, 'grad_norm': 224.0100860595703, 'learning_rate': 1.358983666061706e-05, 'epoch': 10.92} +{'loss': 42.4667, 'grad_norm': 217.3524932861328, 'learning_rate': 1.3584392014519056e-05, 'epoch': 10.92} +{'loss': 40.8693, 'grad_norm': 226.0863494873047, 'learning_rate': 1.3578947368421053e-05, 'epoch': 10.92} +{'loss': 39.5165, 'grad_norm': 278.3658447265625, 'learning_rate': 1.357350272232305e-05, 'epoch': 10.93} +{'loss': 39.3144, 'grad_norm': 226.6543731689453, 'learning_rate': 1.3568058076225045e-05, 'epoch': 10.93} +{'loss': 39.9823, 'grad_norm': 215.39073181152344, 'learning_rate': 1.3562613430127042e-05, 'epoch': 10.94} +{'loss': 40.898, 'grad_norm': 239.6291961669922, 'learning_rate': 1.355716878402904e-05, 'epoch': 10.94} + 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3040/5520 [2:39:25<2:02:07, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6163076162338257, 'eval_runtime': 3.153, 'eval_samples_per_second': 56.771, 'eval_steps_per_second': 56.771, 'epoch': 10.94} +{'loss': 40.8357, 'grad_norm': 251.20431518554688, 'learning_rate': 1.3551724137931035e-05, 'epoch': 10.94} +{'loss': 39.1261, 'grad_norm': 243.96022033691406, 'learning_rate': 1.3546279491833032e-05, 'epoch': 10.95} +{'loss': 40.9375, 'grad_norm': 248.15545654296875, 'learning_rate': 1.3540834845735027e-05, 'epoch': 10.95} +{'loss': 42.4167, 'grad_norm': 215.00927734375, 'learning_rate': 1.3535390199637023e-05, 'epoch': 10.95} +{'loss': 40.7363, 'grad_norm': 263.11566162109375, 'learning_rate': 1.3529945553539021e-05, 'epoch': 10.96} +{'loss': 35.7124, 'grad_norm': 208.59628295898438, 'learning_rate': 1.3524500907441017e-05, 'epoch': 10.96} +{'loss': 33.7512, 'grad_norm': 187.6036834716797, 'learning_rate': 1.3519056261343012e-05, 'epoch': 10.96} +{'loss': 33.4262, 'grad_norm': 217.89825439453125, 'learning_rate': 1.351361161524501e-05, 'epoch': 10.97} +{'loss': 35.2587, 'grad_norm': 235.59889221191406, 'learning_rate': 1.3508166969147005e-05, 'epoch': 10.97} +{'loss': 36.1296, 'grad_norm': 261.9609680175781, 'learning_rate': 1.3502722323049003e-05, 'epoch': 10.98} + 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3050/5520 [2:39:58<2:02:01, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.610818088054657, 'eval_runtime': 3.1502, 'eval_samples_per_second': 56.822, 'eval_steps_per_second': 56.822, 'epoch': 10.98} +{'loss': 35.6712, 'grad_norm': 239.44386291503906, 'learning_rate': 1.3497277676950999e-05, 'epoch': 10.98} +{'loss': 35.9054, 'grad_norm': 260.9620666503906, 'learning_rate': 1.3491833030852994e-05, 'epoch': 10.98} +{'loss': 35.6071, 'grad_norm': 246.35678100585938, 'learning_rate': 1.3486388384754991e-05, 'epoch': 10.99} +{'loss': 37.8261, 'grad_norm': 259.808349609375, 'learning_rate': 1.3480943738656988e-05, 'epoch': 10.99} +{'loss': 29.4662, 'grad_norm': 187.34579467773438, 'learning_rate': 1.3475499092558984e-05, 'epoch': 10.99} +{'loss': 23.668, 'grad_norm': 235.4073486328125, 'learning_rate': 1.3470054446460981e-05, 'epoch': 11.0} +{'loss': 21.3995, 'grad_norm': 171.45904541015625, 'learning_rate': 1.3464609800362976e-05, 'epoch': 11.0} +{'loss': 40.2072, 'grad_norm': 262.18798828125, 'learning_rate': 1.3459165154264972e-05, 'epoch': 11.0} +{'loss': 42.5345, 'grad_norm': 298.67755126953125, 'learning_rate': 1.345372050816697e-05, 'epoch': 11.01} +{'loss': 41.3491, 'grad_norm': 215.71389770507812, 'learning_rate': 1.3448275862068966e-05, 'epoch': 11.01} + 55%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3060/5520 [2:40:30<2:02:49, 3.00s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6099278330802917, 'eval_runtime': 3.1503, 'eval_samples_per_second': 56.82, 'eval_steps_per_second': 56.82, 'epoch': 11.01} +{'loss': 41.0093, 'grad_norm': 243.77044677734375, 'learning_rate': 1.3442831215970963e-05, 'epoch': 11.01} +{'loss': 41.944, 'grad_norm': 205.8600616455078, 'learning_rate': 1.3437386569872958e-05, 'epoch': 11.02} +{'loss': 39.3595, 'grad_norm': 204.25608825683594, 'learning_rate': 1.3431941923774955e-05, 'epoch': 11.02} +{'loss': 42.0208, 'grad_norm': 195.03114318847656, 'learning_rate': 1.3426497277676952e-05, 'epoch': 11.03} +{'loss': 41.2148, 'grad_norm': 193.05857849121094, 'learning_rate': 1.3421052631578948e-05, 'epoch': 11.03} +{'loss': 41.6029, 'grad_norm': 255.9553680419922, 'learning_rate': 1.3415607985480943e-05, 'epoch': 11.03} +{'loss': 41.2583, 'grad_norm': 234.97799682617188, 'learning_rate': 1.341016333938294e-05, 'epoch': 11.04} +{'loss': 39.4893, 'grad_norm': 183.76707458496094, 'learning_rate': 1.3404718693284937e-05, 'epoch': 11.04} +{'loss': 37.697, 'grad_norm': 162.30191040039062, 'learning_rate': 1.3399274047186933e-05, 'epoch': 11.04} +{'loss': 37.2762, 'grad_norm': 223.8235626220703, 'learning_rate': 1.339382940108893e-05, 'epoch': 11.05} + 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3070/5520 [2:41:03<2:01:06, 2.97s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6099210381507874, 'eval_runtime': 3.1526, 'eval_samples_per_second': 56.778, 'eval_steps_per_second': 56.778, 'epoch': 11.05} +{'loss': 37.7674, 'grad_norm': 203.874755859375, 'learning_rate': 1.3388384754990925e-05, 'epoch': 11.05} +{'loss': 39.5784, 'grad_norm': 222.9609832763672, 'learning_rate': 1.3382940108892922e-05, 'epoch': 11.05} +{'loss': 37.5264, 'grad_norm': 177.81871032714844, 'learning_rate': 1.337749546279492e-05, 'epoch': 11.06} +{'loss': 38.5067, 'grad_norm': 209.53326416015625, 'learning_rate': 1.3372050816696915e-05, 'epoch': 11.06} +{'loss': 37.5329, 'grad_norm': 228.35260009765625, 'learning_rate': 1.3366606170598912e-05, 'epoch': 11.07} +{'loss': 39.8565, 'grad_norm': 231.5054168701172, 'learning_rate': 1.3361161524500907e-05, 'epoch': 11.07} +{'loss': 37.9703, 'grad_norm': 184.31460571289062, 'learning_rate': 1.3355716878402904e-05, 'epoch': 11.07} +{'loss': 39.1406, 'grad_norm': 230.06463623046875, 'learning_rate': 1.3350272232304901e-05, 'epoch': 11.08} +{'loss': 39.8019, 'grad_norm': 263.3990478515625, 'learning_rate': 1.3344827586206897e-05, 'epoch': 11.08} +{'loss': 40.195, 'grad_norm': 217.89923095703125, 'learning_rate': 1.3339382940108892e-05, 'epoch': 11.08} + 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3080/5520 [2:41:35<1:58:51, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6136859655380249, 'eval_runtime': 3.142, 'eval_samples_per_second': 56.97, 'eval_steps_per_second': 56.97, 'epoch': 11.08} +{'loss': 39.1668, 'grad_norm': 238.8343505859375, 'learning_rate': 1.333393829401089e-05, 'epoch': 11.09} +{'loss': 40.3355, 'grad_norm': 288.6470947265625, 'learning_rate': 1.3328493647912886e-05, 'epoch': 11.09} +{'loss': 41.5359, 'grad_norm': 284.3423156738281, 'learning_rate': 1.3323049001814883e-05, 'epoch': 11.09} +{'loss': 41.3219, 'grad_norm': 263.0945739746094, 'learning_rate': 1.3317604355716879e-05, 'epoch': 11.1} +{'loss': 39.7292, 'grad_norm': 208.96383666992188, 'learning_rate': 1.3312159709618874e-05, 'epoch': 11.1} +{'loss': 35.282, 'grad_norm': 233.49888610839844, 'learning_rate': 1.3306715063520873e-05, 'epoch': 11.1} +{'loss': 34.4335, 'grad_norm': 216.6250762939453, 'learning_rate': 1.3301270417422868e-05, 'epoch': 11.11} +{'loss': 32.7557, 'grad_norm': 182.3594970703125, 'learning_rate': 1.3295825771324864e-05, 'epoch': 11.11} +{'loss': 32.185, 'grad_norm': 215.4852752685547, 'learning_rate': 1.329038112522686e-05, 'epoch': 11.12} +{'loss': 32.8733, 'grad_norm': 237.4733123779297, 'learning_rate': 1.3284936479128856e-05, 'epoch': 11.12} + 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3090/5520 [2:42:07<1:59:42, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6130570769309998, 'eval_runtime': 3.154, 'eval_samples_per_second': 56.754, 'eval_steps_per_second': 56.754, 'epoch': 11.12} +{'loss': 33.89, 'grad_norm': 202.9044952392578, 'learning_rate': 1.3279491833030853e-05, 'epoch': 11.12} +{'loss': 34.0808, 'grad_norm': 230.82086181640625, 'learning_rate': 1.327404718693285e-05, 'epoch': 11.13} +{'loss': 35.5715, 'grad_norm': 318.1103515625, 'learning_rate': 1.3268602540834846e-05, 'epoch': 11.13} +{'loss': 36.0701, 'grad_norm': 296.760986328125, 'learning_rate': 1.3263157894736843e-05, 'epoch': 11.13} +{'loss': 35.027, 'grad_norm': 355.1922302246094, 'learning_rate': 1.3257713248638838e-05, 'epoch': 11.14} +{'loss': 36.8225, 'grad_norm': 379.0643310546875, 'learning_rate': 1.3252268602540835e-05, 'epoch': 11.14} +{'loss': 34.18, 'grad_norm': 271.0293273925781, 'learning_rate': 1.3246823956442832e-05, 'epoch': 11.14} +{'loss': 37.5546, 'grad_norm': 231.29782104492188, 'learning_rate': 1.3241379310344828e-05, 'epoch': 11.15} +{'loss': 35.8625, 'grad_norm': 236.58180236816406, 'learning_rate': 1.3235934664246823e-05, 'epoch': 11.15} +{'loss': 38.1384, 'grad_norm': 220.71853637695312, 'learning_rate': 1.3230490018148822e-05, 'epoch': 11.16} + 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3100/5520 [2:42:40<1:58:37, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6140565276145935, 'eval_runtime': 3.1543, 'eval_samples_per_second': 56.747, 'eval_steps_per_second': 56.747, 'epoch': 11.16} +{'loss': 36.7226, 'grad_norm': 251.32090759277344, 'learning_rate': 1.3225045372050817e-05, 'epoch': 11.16} +{'loss': 37.2144, 'grad_norm': 244.061279296875, 'learning_rate': 1.3219600725952814e-05, 'epoch': 11.16} +{'loss': 27.0703, 'grad_norm': 274.3013610839844, 'learning_rate': 1.321415607985481e-05, 'epoch': 11.17} +{'loss': 23.0504, 'grad_norm': 197.1829071044922, 'learning_rate': 1.3208711433756805e-05, 'epoch': 11.17} +{'loss': 23.4632, 'grad_norm': 205.8387451171875, 'learning_rate': 1.3203266787658804e-05, 'epoch': 11.17} +{'loss': 23.9426, 'grad_norm': 237.6263427734375, 'learning_rate': 1.31978221415608e-05, 'epoch': 11.18} +{'loss': 24.2553, 'grad_norm': 177.99688720703125, 'learning_rate': 1.3192377495462795e-05, 'epoch': 11.18} +{'loss': 41.3257, 'grad_norm': 235.16787719726562, 'learning_rate': 1.3186932849364792e-05, 'epoch': 11.18} +{'loss': 42.3344, 'grad_norm': 213.4043731689453, 'learning_rate': 1.3181488203266787e-05, 'epoch': 11.19} +{'loss': 41.2702, 'grad_norm': 162.57554626464844, 'learning_rate': 1.3176043557168784e-05, 'epoch': 11.19} + 56%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3110/5520 [2:43:12<1:58:04, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6155741214752197, 'eval_runtime': 3.137, 'eval_samples_per_second': 57.06, 'eval_steps_per_second': 57.06, 'epoch': 11.19} +{'loss': 41.0582, 'grad_norm': 215.84335327148438, 'learning_rate': 1.3170598911070781e-05, 'epoch': 11.2} +{'loss': 41.3479, 'grad_norm': 295.0271301269531, 'learning_rate': 1.3165154264972777e-05, 'epoch': 11.2} +{'loss': 41.6267, 'grad_norm': 287.3316955566406, 'learning_rate': 1.3159709618874774e-05, 'epoch': 11.2} +{'loss': 40.5208, 'grad_norm': 249.3993377685547, 'learning_rate': 1.315426497277677e-05, 'epoch': 11.21} +{'loss': 41.7072, 'grad_norm': 274.5410461425781, 'learning_rate': 1.3148820326678766e-05, 'epoch': 11.21} +{'loss': 41.0034, 'grad_norm': 259.49627685546875, 'learning_rate': 1.3143375680580763e-05, 'epoch': 11.21} +{'loss': 40.1154, 'grad_norm': 246.60902404785156, 'learning_rate': 1.3137931034482759e-05, 'epoch': 11.22} +{'loss': 41.1167, 'grad_norm': 224.0052947998047, 'learning_rate': 1.3132486388384754e-05, 'epoch': 11.22} +{'loss': 37.0909, 'grad_norm': 204.24021911621094, 'learning_rate': 1.3127041742286753e-05, 'epoch': 11.22} +{'loss': 38.0959, 'grad_norm': 206.67681884765625, 'learning_rate': 1.3121597096188748e-05, 'epoch': 11.23} + 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3120/5520 [2:43:44<1:56:03, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6148640513420105, 'eval_runtime': 3.1515, 'eval_samples_per_second': 56.798, 'eval_steps_per_second': 56.798, 'epoch': 11.23} +{'loss': 38.8076, 'grad_norm': 255.91238403320312, 'learning_rate': 1.3116152450090743e-05, 'epoch': 11.23} +{'loss': 39.3991, 'grad_norm': 239.5032958984375, 'learning_rate': 1.311070780399274e-05, 'epoch': 11.23} +{'loss': 37.7301, 'grad_norm': 254.8914031982422, 'learning_rate': 1.3105263157894738e-05, 'epoch': 11.24} +{'loss': 38.8527, 'grad_norm': 229.97943115234375, 'learning_rate': 1.3099818511796735e-05, 'epoch': 11.24} +{'loss': 38.8518, 'grad_norm': 208.1148681640625, 'learning_rate': 1.309437386569873e-05, 'epoch': 11.25} +{'loss': 38.927, 'grad_norm': 208.49557495117188, 'learning_rate': 1.3088929219600725e-05, 'epoch': 11.25} +{'loss': 40.0492, 'grad_norm': 332.9958801269531, 'learning_rate': 1.3083484573502723e-05, 'epoch': 11.25} +{'loss': 39.1965, 'grad_norm': 253.16769409179688, 'learning_rate': 1.307803992740472e-05, 'epoch': 11.26} +{'loss': 38.2286, 'grad_norm': 243.8136444091797, 'learning_rate': 1.3072595281306715e-05, 'epoch': 11.26} +{'loss': 39.3751, 'grad_norm': 273.6463623046875, 'learning_rate': 1.3067150635208712e-05, 'epoch': 11.26} + 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3130/5520 [2:44:16<1:57:31, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6175129413604736, 'eval_runtime': 3.145, 'eval_samples_per_second': 56.916, 'eval_steps_per_second': 56.916, 'epoch': 11.26} +{'loss': 40.29, 'grad_norm': 228.980224609375, 'learning_rate': 1.3061705989110707e-05, 'epoch': 11.27} +{'loss': 41.1785, 'grad_norm': 292.6310729980469, 'learning_rate': 1.3056261343012703e-05, 'epoch': 11.27} +{'loss': 40.9514, 'grad_norm': 217.0737762451172, 'learning_rate': 1.3050816696914702e-05, 'epoch': 11.27} +{'loss': 39.6132, 'grad_norm': 227.0102081298828, 'learning_rate': 1.3045372050816697e-05, 'epoch': 11.28} +{'loss': 39.5024, 'grad_norm': 195.74667358398438, 'learning_rate': 1.3039927404718694e-05, 'epoch': 11.28} +{'loss': 37.7863, 'grad_norm': 222.6744384765625, 'learning_rate': 1.303448275862069e-05, 'epoch': 11.29} +{'loss': 34.9129, 'grad_norm': 207.1038055419922, 'learning_rate': 1.3029038112522687e-05, 'epoch': 11.29} +{'loss': 33.231, 'grad_norm': 227.38330078125, 'learning_rate': 1.3023593466424684e-05, 'epoch': 11.29} +{'loss': 33.3166, 'grad_norm': 254.19442749023438, 'learning_rate': 1.3018148820326679e-05, 'epoch': 11.3} +{'loss': 33.2336, 'grad_norm': 221.4664306640625, 'learning_rate': 1.3012704174228674e-05, 'epoch': 11.3} + 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3140/5520 [2:44:48<1:57:31, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6138683557510376, 'eval_runtime': 3.1463, 'eval_samples_per_second': 56.892, 'eval_steps_per_second': 56.892, 'epoch': 11.3} +{'loss': 34.0082, 'grad_norm': 179.73678588867188, 'learning_rate': 1.3007259528130671e-05, 'epoch': 11.3} +{'loss': 33.1898, 'grad_norm': 238.66107177734375, 'learning_rate': 1.3001814882032669e-05, 'epoch': 11.31} +{'loss': 34.5558, 'grad_norm': 315.51934814453125, 'learning_rate': 1.2996370235934666e-05, 'epoch': 11.31} +{'loss': 32.4498, 'grad_norm': 235.54217529296875, 'learning_rate': 1.2990925589836661e-05, 'epoch': 11.31} +{'loss': 34.1823, 'grad_norm': 225.9518280029297, 'learning_rate': 1.2985480943738656e-05, 'epoch': 11.32} +{'loss': 34.6704, 'grad_norm': 276.5481262207031, 'learning_rate': 1.2980036297640655e-05, 'epoch': 11.32} +{'loss': 35.9149, 'grad_norm': 306.4985656738281, 'learning_rate': 1.297459165154265e-05, 'epoch': 11.33} +{'loss': 34.876, 'grad_norm': 207.28550720214844, 'learning_rate': 1.2969147005444646e-05, 'epoch': 11.33} +{'loss': 36.7191, 'grad_norm': 238.89157104492188, 'learning_rate': 1.2963702359346643e-05, 'epoch': 11.33} +{'loss': 37.9134, 'grad_norm': 281.7445068359375, 'learning_rate': 1.2958257713248638e-05, 'epoch': 11.34} + 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3150/5520 [2:45:21<1:58:13, 2.99s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6141538023948669, 'eval_runtime': 3.1622, 'eval_samples_per_second': 56.606, 'eval_steps_per_second': 56.606, 'epoch': 11.34} +{'loss': 36.7193, 'grad_norm': 261.58221435546875, 'learning_rate': 1.2952813067150635e-05, 'epoch': 11.34} +{'loss': 36.9418, 'grad_norm': 260.8083190917969, 'learning_rate': 1.2947368421052633e-05, 'epoch': 11.34} +{'loss': 31.1083, 'grad_norm': 263.466552734375, 'learning_rate': 1.2941923774954628e-05, 'epoch': 11.35} +{'loss': 23.4982, 'grad_norm': 201.6587677001953, 'learning_rate': 1.2936479128856625e-05, 'epoch': 11.35} +{'loss': 22.5417, 'grad_norm': 230.29629516601562, 'learning_rate': 1.293103448275862e-05, 'epoch': 11.35} +{'loss': 23.6032, 'grad_norm': 193.08795166015625, 'learning_rate': 1.2925589836660617e-05, 'epoch': 11.36} +{'loss': 24.1813, 'grad_norm': 206.49093627929688, 'learning_rate': 1.2920145190562615e-05, 'epoch': 11.36} +{'loss': 41.4394, 'grad_norm': 285.38348388671875, 'learning_rate': 1.291470054446461e-05, 'epoch': 11.36} +{'loss': 43.8865, 'grad_norm': 307.4984130859375, 'learning_rate': 1.2909255898366605e-05, 'epoch': 11.37} +{'loss': 41.5534, 'grad_norm': 256.685791015625, 'learning_rate': 1.2903811252268604e-05, 'epoch': 11.37} + 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3160/5520 [2:45:54<1:56:12, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6155339479446411, 'eval_runtime': 3.1488, 'eval_samples_per_second': 56.846, 'eval_steps_per_second': 56.846, 'epoch': 11.37} +{'loss': 41.5231, 'grad_norm': 302.5317077636719, 'learning_rate': 1.28983666061706e-05, 'epoch': 11.38} +{'loss': 40.7064, 'grad_norm': 381.4787292480469, 'learning_rate': 1.2892921960072595e-05, 'epoch': 11.38} +{'loss': 41.4045, 'grad_norm': 313.63116455078125, 'learning_rate': 1.2887477313974592e-05, 'epoch': 11.38} +{'loss': 41.2618, 'grad_norm': 265.4134521484375, 'learning_rate': 1.2882032667876587e-05, 'epoch': 11.39} +{'loss': 42.6311, 'grad_norm': 260.43084716796875, 'learning_rate': 1.2876588021778586e-05, 'epoch': 11.39} +{'loss': 41.8859, 'grad_norm': 326.7022705078125, 'learning_rate': 1.2871143375680581e-05, 'epoch': 11.39} +{'loss': 41.8117, 'grad_norm': 420.966552734375, 'learning_rate': 1.2865698729582577e-05, 'epoch': 11.4} +{'loss': 41.3303, 'grad_norm': 280.8377380371094, 'learning_rate': 1.2860254083484574e-05, 'epoch': 11.4} +{'loss': 38.253, 'grad_norm': 238.64564514160156, 'learning_rate': 1.2854809437386571e-05, 'epoch': 11.4} +{'loss': 39.2494, 'grad_norm': 258.8091125488281, 'learning_rate': 1.2849364791288566e-05, 'epoch': 11.41} + 57%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3170/5520 [2:46:26<1:55:22, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6130858659744263, 'eval_runtime': 3.1459, 'eval_samples_per_second': 56.9, 'eval_steps_per_second': 56.9, 'epoch': 11.41} +{'loss': 39.1069, 'grad_norm': 209.76300048828125, 'learning_rate': 1.2843920145190563e-05, 'epoch': 11.41} +{'loss': 38.8867, 'grad_norm': 215.24072265625, 'learning_rate': 1.2838475499092559e-05, 'epoch': 11.42} +{'loss': 38.0298, 'grad_norm': 285.4281311035156, 'learning_rate': 1.2833030852994554e-05, 'epoch': 11.42} +{'loss': 40.2122, 'grad_norm': 322.1593017578125, 'learning_rate': 1.2827586206896553e-05, 'epoch': 11.42} +{'loss': 38.0829, 'grad_norm': 277.2178955078125, 'learning_rate': 1.2822141560798548e-05, 'epoch': 11.43} +{'loss': 40.6601, 'grad_norm': 186.9705810546875, 'learning_rate': 1.2816696914700545e-05, 'epoch': 11.43} +{'loss': 39.0126, 'grad_norm': 210.6102294921875, 'learning_rate': 1.281125226860254e-05, 'epoch': 11.43} +{'loss': 38.6465, 'grad_norm': 234.50717163085938, 'learning_rate': 1.2805807622504536e-05, 'epoch': 11.44} +{'loss': 39.2568, 'grad_norm': 217.9093475341797, 'learning_rate': 1.2800362976406535e-05, 'epoch': 11.44} +{'loss': 39.005, 'grad_norm': 252.82054138183594, 'learning_rate': 1.279491833030853e-05, 'epoch': 11.44} + 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3180/5520 [2:46:58<1:54:05, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6125118732452393, 'eval_runtime': 3.1425, 'eval_samples_per_second': 56.961, 'eval_steps_per_second': 56.961, 'epoch': 11.44} +{'loss': 39.6133, 'grad_norm': 290.2322998046875, 'learning_rate': 1.2789473684210526e-05, 'epoch': 11.45} +{'loss': 40.3251, 'grad_norm': 250.72450256347656, 'learning_rate': 1.2784029038112523e-05, 'epoch': 11.45} +{'loss': 39.5129, 'grad_norm': 273.91229248046875, 'learning_rate': 1.277858439201452e-05, 'epoch': 11.46} +{'loss': 40.5093, 'grad_norm': 214.30038452148438, 'learning_rate': 1.2773139745916515e-05, 'epoch': 11.46} +{'loss': 38.3837, 'grad_norm': 264.251708984375, 'learning_rate': 1.2767695099818512e-05, 'epoch': 11.46} +{'loss': 37.8522, 'grad_norm': 224.7700653076172, 'learning_rate': 1.2762250453720508e-05, 'epoch': 11.47} +{'loss': 34.0249, 'grad_norm': 238.35604858398438, 'learning_rate': 1.2756805807622505e-05, 'epoch': 11.47} +{'loss': 34.2473, 'grad_norm': 181.4731903076172, 'learning_rate': 1.2751361161524502e-05, 'epoch': 11.47} +{'loss': 32.8657, 'grad_norm': 240.2397003173828, 'learning_rate': 1.2745916515426497e-05, 'epoch': 11.48} +{'loss': 34.6619, 'grad_norm': 283.2740478515625, 'learning_rate': 1.2740471869328494e-05, 'epoch': 11.48} + 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3190/5520 [2:47:30<1:53:41, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6126638054847717, 'eval_runtime': 3.1416, 'eval_samples_per_second': 56.976, 'eval_steps_per_second': 56.976, 'epoch': 11.48} +{'loss': 33.0975, 'grad_norm': 248.70912170410156, 'learning_rate': 1.273502722323049e-05, 'epoch': 11.48} +{'loss': 34.2069, 'grad_norm': 210.9479217529297, 'learning_rate': 1.2729582577132487e-05, 'epoch': 11.49} +{'loss': 35.811, 'grad_norm': 234.31399536132812, 'learning_rate': 1.2724137931034484e-05, 'epoch': 11.49} +{'loss': 35.6234, 'grad_norm': 253.24478149414062, 'learning_rate': 1.271869328493648e-05, 'epoch': 11.49} +{'loss': 35.1495, 'grad_norm': 259.0565185546875, 'learning_rate': 1.2713248638838476e-05, 'epoch': 11.5} +{'loss': 35.1363, 'grad_norm': 235.4202880859375, 'learning_rate': 1.2707803992740472e-05, 'epoch': 11.5} +{'loss': 35.9653, 'grad_norm': 248.30267333984375, 'learning_rate': 1.2702359346642469e-05, 'epoch': 11.51} +{'loss': 35.6304, 'grad_norm': 197.6142120361328, 'learning_rate': 1.2696914700544466e-05, 'epoch': 11.51} +{'loss': 35.6111, 'grad_norm': 329.27862548828125, 'learning_rate': 1.2691470054446461e-05, 'epoch': 11.51} +{'loss': 35.0693, 'grad_norm': 194.7126922607422, 'learning_rate': 1.2686025408348457e-05, 'epoch': 11.52} + 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3200/5520 [2:48:02<1:53:07, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6106634736061096, 'eval_runtime': 3.1408, 'eval_samples_per_second': 56.991, 'eval_steps_per_second': 56.991, 'epoch': 11.52} +{'loss': 37.6373, 'grad_norm': 243.0207061767578, 'learning_rate': 1.2680580762250454e-05, 'epoch': 11.52} +{'loss': 36.2595, 'grad_norm': 282.0947265625, 'learning_rate': 1.267513611615245e-05, 'epoch': 11.52} +{'loss': 35.5601, 'grad_norm': 249.8011932373047, 'learning_rate': 1.2669691470054446e-05, 'epoch': 11.53} +{'loss': 23.1075, 'grad_norm': 202.17503356933594, 'learning_rate': 1.2664246823956443e-05, 'epoch': 11.53} +{'loss': 22.2458, 'grad_norm': 188.78128051757812, 'learning_rate': 1.2658802177858439e-05, 'epoch': 11.53} +{'loss': 23.7842, 'grad_norm': 219.24722290039062, 'learning_rate': 1.2653357531760437e-05, 'epoch': 11.54} +{'loss': 25.3773, 'grad_norm': 213.0615234375, 'learning_rate': 1.2647912885662433e-05, 'epoch': 11.54} +{'loss': 40.396, 'grad_norm': 274.6806335449219, 'learning_rate': 1.2642468239564428e-05, 'epoch': 11.55} +{'loss': 42.2405, 'grad_norm': 248.91778564453125, 'learning_rate': 1.2637023593466425e-05, 'epoch': 11.55} +{'loss': 40.7328, 'grad_norm': 228.45591735839844, 'learning_rate': 1.263157894736842e-05, 'epoch': 11.55} + 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3200/5520 [2:48:05<1:53:07, 2.93s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3210/5520 [2:48:35<1:53:10, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6154705286026001, 'eval_runtime': 3.1381, 'eval_samples_per_second': 57.04, 'eval_steps_per_second': 57.04, 'epoch': 11.55} +{'loss': 40.6909, 'grad_norm': 206.54483032226562, 'learning_rate': 1.2626134301270418e-05, 'epoch': 11.56} +{'loss': 40.6918, 'grad_norm': 199.14816284179688, 'learning_rate': 1.2620689655172415e-05, 'epoch': 11.56} +{'loss': 41.686, 'grad_norm': 217.4789276123047, 'learning_rate': 1.261524500907441e-05, 'epoch': 11.56} +{'loss': 40.685, 'grad_norm': 209.83084106445312, 'learning_rate': 1.2609800362976406e-05, 'epoch': 11.57} +{'loss': 42.1684, 'grad_norm': 184.56614685058594, 'learning_rate': 1.2604355716878404e-05, 'epoch': 11.57} +{'loss': 42.4169, 'grad_norm': 226.84622192382812, 'learning_rate': 1.25989110707804e-05, 'epoch': 11.57} +{'loss': 41.9603, 'grad_norm': 271.7705383300781, 'learning_rate': 1.2593466424682397e-05, 'epoch': 11.58} +{'loss': 39.9903, 'grad_norm': 206.48257446289062, 'learning_rate': 1.2588021778584392e-05, 'epoch': 11.58} +{'loss': 39.3138, 'grad_norm': 190.86009216308594, 'learning_rate': 1.2582577132486388e-05, 'epoch': 11.59} +{'loss': 37.652, 'grad_norm': 217.0152130126953, 'learning_rate': 1.2577132486388386e-05, 'epoch': 11.59} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 58%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3220/5520 [2:49:07<1:51:51, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6143624186515808, 'eval_runtime': 3.1406, 'eval_samples_per_second': 56.996, 'eval_steps_per_second': 56.996, 'epoch': 11.59} +{'loss': 38.5532, 'grad_norm': 203.3090362548828, 'learning_rate': 1.2571687840290382e-05, 'epoch': 11.59} +{'loss': 38.4073, 'grad_norm': 237.18287658691406, 'learning_rate': 1.2566243194192377e-05, 'epoch': 11.6} +{'loss': 37.7122, 'grad_norm': 222.20489501953125, 'learning_rate': 1.2560798548094374e-05, 'epoch': 11.6} +{'loss': 39.0125, 'grad_norm': 261.4862060546875, 'learning_rate': 1.255535390199637e-05, 'epoch': 11.6} +{'loss': 38.1753, 'grad_norm': 235.49668884277344, 'learning_rate': 1.2549909255898367e-05, 'epoch': 11.61} +{'loss': 40.3478, 'grad_norm': 219.66139221191406, 'learning_rate': 1.2544464609800364e-05, 'epoch': 11.61} +{'loss': 39.3672, 'grad_norm': 282.8075256347656, 'learning_rate': 1.2539019963702359e-05, 'epoch': 11.61} +{'loss': 39.8955, 'grad_norm': 235.07875061035156, 'learning_rate': 1.2533575317604356e-05, 'epoch': 11.62} +{'loss': 38.626, 'grad_norm': 328.829833984375, 'learning_rate': 1.2528130671506353e-05, 'epoch': 11.62} +{'loss': 40.0565, 'grad_norm': 283.1789245605469, 'learning_rate': 1.2522686025408349e-05, 'epoch': 11.62} + 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3230/5520 [2:49:39<1:50:13, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6113889217376709, 'eval_runtime': 3.1363, 'eval_samples_per_second': 57.073, 'eval_steps_per_second': 57.073, 'epoch': 11.62} +{'loss': 40.1155, 'grad_norm': 230.88047790527344, 'learning_rate': 1.2517241379310346e-05, 'epoch': 11.63} +{'loss': 40.4707, 'grad_norm': 258.1295166015625, 'learning_rate': 1.2511796733212341e-05, 'epoch': 11.63} +{'loss': 41.1296, 'grad_norm': 255.82699584960938, 'learning_rate': 1.2506352087114336e-05, 'epoch': 11.64} +{'loss': 39.1159, 'grad_norm': 226.4784393310547, 'learning_rate': 1.2500907441016335e-05, 'epoch': 11.64} +{'loss': 40.7933, 'grad_norm': 257.38104248046875, 'learning_rate': 1.249546279491833e-05, 'epoch': 11.64} +{'loss': 39.6723, 'grad_norm': 218.69070434570312, 'learning_rate': 1.2490018148820328e-05, 'epoch': 11.65} +{'loss': 37.5671, 'grad_norm': 232.3351287841797, 'learning_rate': 1.2484573502722323e-05, 'epoch': 11.65} +{'loss': 32.7819, 'grad_norm': 229.93295288085938, 'learning_rate': 1.2479128856624318e-05, 'epoch': 11.65} +{'loss': 32.5955, 'grad_norm': 265.6002197265625, 'learning_rate': 1.2473684210526317e-05, 'epoch': 11.66} +{'loss': 32.9901, 'grad_norm': 278.47705078125, 'learning_rate': 1.2468239564428313e-05, 'epoch': 11.66} + 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3240/5520 [2:50:10<1:49:50, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6078047752380371, 'eval_runtime': 3.1366, 'eval_samples_per_second': 57.068, 'eval_steps_per_second': 57.068, 'epoch': 11.66} +{'loss': 33.2737, 'grad_norm': 239.9285430908203, 'learning_rate': 1.2462794918330308e-05, 'epoch': 11.66} +{'loss': 34.8522, 'grad_norm': 358.36090087890625, 'learning_rate': 1.2457350272232305e-05, 'epoch': 11.67} +{'loss': 34.6796, 'grad_norm': 258.0733642578125, 'learning_rate': 1.2451905626134302e-05, 'epoch': 11.67} +{'loss': 35.8479, 'grad_norm': 296.21942138671875, 'learning_rate': 1.2446460980036298e-05, 'epoch': 11.68} +{'loss': 36.4934, 'grad_norm': 229.6141815185547, 'learning_rate': 1.2441016333938295e-05, 'epoch': 11.68} +{'loss': 35.2253, 'grad_norm': 238.6092987060547, 'learning_rate': 1.243557168784029e-05, 'epoch': 11.68} +{'loss': 34.9373, 'grad_norm': 300.76300048828125, 'learning_rate': 1.2430127041742287e-05, 'epoch': 11.69} +{'loss': 35.4369, 'grad_norm': 227.70672607421875, 'learning_rate': 1.2424682395644284e-05, 'epoch': 11.69} +{'loss': 35.3398, 'grad_norm': 218.36000061035156, 'learning_rate': 1.241923774954628e-05, 'epoch': 11.69} +{'loss': 35.7612, 'grad_norm': 220.78475952148438, 'learning_rate': 1.2413793103448277e-05, 'epoch': 11.7} + 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3250/5520 [2:50:42<1:50:49, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6067846417427063, 'eval_runtime': 3.1322, 'eval_samples_per_second': 57.148, 'eval_steps_per_second': 57.148, 'epoch': 11.7} +{'loss': 38.0459, 'grad_norm': 237.34437561035156, 'learning_rate': 1.2408348457350272e-05, 'epoch': 11.7} +{'loss': 35.4676, 'grad_norm': 251.60633850097656, 'learning_rate': 1.2402903811252269e-05, 'epoch': 11.7} +{'loss': 30.5595, 'grad_norm': 214.17117309570312, 'learning_rate': 1.2397459165154266e-05, 'epoch': 11.71} +{'loss': 23.7468, 'grad_norm': 202.3698272705078, 'learning_rate': 1.2392014519056262e-05, 'epoch': 11.71} +{'loss': 23.1255, 'grad_norm': 229.11776733398438, 'learning_rate': 1.2386569872958257e-05, 'epoch': 11.72} +{'loss': 23.7349, 'grad_norm': 175.93829345703125, 'learning_rate': 1.2381125226860254e-05, 'epoch': 11.72} +{'loss': 24.4997, 'grad_norm': 232.7489471435547, 'learning_rate': 1.2375680580762251e-05, 'epoch': 11.72} +{'loss': 42.3811, 'grad_norm': 280.5601806640625, 'learning_rate': 1.2370235934664248e-05, 'epoch': 11.73} +{'loss': 42.9804, 'grad_norm': 292.2538146972656, 'learning_rate': 1.2364791288566244e-05, 'epoch': 11.73} +{'loss': 41.1251, 'grad_norm': 265.0259704589844, 'learning_rate': 1.2359346642468239e-05, 'epoch': 11.73} + 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3260/5520 [2:51:14<1:49:00, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6141200065612793, 'eval_runtime': 3.1404, 'eval_samples_per_second': 56.999, 'eval_steps_per_second': 56.999, 'epoch': 11.73} +{'loss': 40.9372, 'grad_norm': 232.92893981933594, 'learning_rate': 1.2353901996370236e-05, 'epoch': 11.74} +{'loss': 41.0757, 'grad_norm': 176.99818420410156, 'learning_rate': 1.2348457350272233e-05, 'epoch': 11.74} +{'loss': 41.9635, 'grad_norm': 206.5728759765625, 'learning_rate': 1.2343012704174228e-05, 'epoch': 11.74} +{'loss': 41.5217, 'grad_norm': 211.2556915283203, 'learning_rate': 1.2337568058076226e-05, 'epoch': 11.75} +{'loss': 42.9997, 'grad_norm': 198.8915252685547, 'learning_rate': 1.2332123411978221e-05, 'epoch': 11.75} +{'loss': 42.2561, 'grad_norm': 291.2761535644531, 'learning_rate': 1.2326678765880218e-05, 'epoch': 11.75} +{'loss': 41.6219, 'grad_norm': 243.2998046875, 'learning_rate': 1.2321234119782215e-05, 'epoch': 11.76} +{'loss': 40.1646, 'grad_norm': 266.1149597167969, 'learning_rate': 1.231578947368421e-05, 'epoch': 11.76} +{'loss': 39.7079, 'grad_norm': 236.6083221435547, 'learning_rate': 1.2310344827586208e-05, 'epoch': 11.77} +{'loss': 39.6629, 'grad_norm': 196.397216796875, 'learning_rate': 1.2304900181488203e-05, 'epoch': 11.77} + 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3270/5520 [2:51:45<1:48:18, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6124016046524048, 'eval_runtime': 3.1373, 'eval_samples_per_second': 57.055, 'eval_steps_per_second': 57.055, 'epoch': 11.77} +{'loss': 38.5285, 'grad_norm': 198.52500915527344, 'learning_rate': 1.22994555353902e-05, 'epoch': 11.77} +{'loss': 38.3358, 'grad_norm': 236.25477600097656, 'learning_rate': 1.2294010889292197e-05, 'epoch': 11.78} +{'loss': 38.374, 'grad_norm': 260.35955810546875, 'learning_rate': 1.2288566243194192e-05, 'epoch': 11.78} +{'loss': 39.124, 'grad_norm': 313.078857421875, 'learning_rate': 1.2283121597096188e-05, 'epoch': 11.78} +{'loss': 39.1776, 'grad_norm': 191.34027099609375, 'learning_rate': 1.2277676950998187e-05, 'epoch': 11.79} +{'loss': 38.7885, 'grad_norm': 203.5764923095703, 'learning_rate': 1.2272232304900182e-05, 'epoch': 11.79} +{'loss': 39.1353, 'grad_norm': 234.38479614257812, 'learning_rate': 1.2266787658802177e-05, 'epoch': 11.79} +{'loss': 38.141, 'grad_norm': 254.5694122314453, 'learning_rate': 1.2261343012704174e-05, 'epoch': 11.8} +{'loss': 39.5199, 'grad_norm': 189.8268585205078, 'learning_rate': 1.225589836660617e-05, 'epoch': 11.8} +{'loss': 41.5113, 'grad_norm': 256.52728271484375, 'learning_rate': 1.2250453720508169e-05, 'epoch': 11.81} + 59%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3280/5520 [2:52:17<1:47:58, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6084021329879761, 'eval_runtime': 3.1363, 'eval_samples_per_second': 57.073, 'eval_steps_per_second': 57.073, 'epoch': 11.81} +{'loss': 39.8129, 'grad_norm': 195.57321166992188, 'learning_rate': 1.2245009074410164e-05, 'epoch': 11.81} +{'loss': 40.2273, 'grad_norm': 228.6748809814453, 'learning_rate': 1.223956442831216e-05, 'epoch': 11.81} +{'loss': 40.2254, 'grad_norm': 209.96096801757812, 'learning_rate': 1.2234119782214156e-05, 'epoch': 11.82} +{'loss': 40.71, 'grad_norm': 247.4613037109375, 'learning_rate': 1.2228675136116152e-05, 'epoch': 11.82} +{'loss': 39.5572, 'grad_norm': 263.0521240234375, 'learning_rate': 1.2223230490018149e-05, 'epoch': 11.82} +{'loss': 36.4388, 'grad_norm': 225.53634643554688, 'learning_rate': 1.2217785843920146e-05, 'epoch': 11.83} +{'loss': 33.1005, 'grad_norm': 194.59527587890625, 'learning_rate': 1.2212341197822141e-05, 'epoch': 11.83} +{'loss': 32.9812, 'grad_norm': 314.715576171875, 'learning_rate': 1.2206896551724138e-05, 'epoch': 11.83} +{'loss': 33.6331, 'grad_norm': 205.86862182617188, 'learning_rate': 1.2201451905626136e-05, 'epoch': 11.84} +{'loss': 33.6535, 'grad_norm': 217.54722595214844, 'learning_rate': 1.2196007259528131e-05, 'epoch': 11.84} + 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3290/5520 [2:52:49<1:47:13, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.609620213508606, 'eval_runtime': 3.1387, 'eval_samples_per_second': 57.03, 'eval_steps_per_second': 57.03, 'epoch': 11.84} +{'loss': 34.5218, 'grad_norm': 231.25390625, 'learning_rate': 1.2190562613430128e-05, 'epoch': 11.85} +{'loss': 34.354, 'grad_norm': 208.8440704345703, 'learning_rate': 1.2185117967332123e-05, 'epoch': 11.85} +{'loss': 34.5705, 'grad_norm': 221.25547790527344, 'learning_rate': 1.2179673321234119e-05, 'epoch': 11.85} +{'loss': 35.796, 'grad_norm': 331.4505920410156, 'learning_rate': 1.2174228675136118e-05, 'epoch': 11.86} +{'loss': 36.4544, 'grad_norm': 337.1404113769531, 'learning_rate': 1.2168784029038113e-05, 'epoch': 11.86} +{'loss': 35.7165, 'grad_norm': 238.75303649902344, 'learning_rate': 1.2163339382940108e-05, 'epoch': 11.86} +{'loss': 35.5461, 'grad_norm': 260.088134765625, 'learning_rate': 1.2157894736842105e-05, 'epoch': 11.87} +{'loss': 37.0143, 'grad_norm': 265.0240173339844, 'learning_rate': 1.2152450090744102e-05, 'epoch': 11.87} +{'loss': 36.6145, 'grad_norm': 251.74273681640625, 'learning_rate': 1.21470054446461e-05, 'epoch': 11.87} +{'loss': 36.3135, 'grad_norm': 216.8999786376953, 'learning_rate': 1.2141560798548095e-05, 'epoch': 11.88} + 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3300/5520 [2:53:20<1:46:47, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6087896823883057, 'eval_runtime': 3.1367, 'eval_samples_per_second': 57.066, 'eval_steps_per_second': 57.066, 'epoch': 11.88} +{'loss': 36.6596, 'grad_norm': 256.50006103515625, 'learning_rate': 1.213611615245009e-05, 'epoch': 11.88} +{'loss': 37.6473, 'grad_norm': 249.34164428710938, 'learning_rate': 1.2130671506352087e-05, 'epoch': 11.88} +{'loss': 28.2839, 'grad_norm': 211.9344940185547, 'learning_rate': 1.2125226860254084e-05, 'epoch': 11.89} +{'loss': 23.2231, 'grad_norm': 170.77166748046875, 'learning_rate': 1.211978221415608e-05, 'epoch': 11.89} +{'loss': 22.7909, 'grad_norm': 177.49789428710938, 'learning_rate': 1.2114337568058077e-05, 'epoch': 11.9} +{'loss': 23.8062, 'grad_norm': 189.0458221435547, 'learning_rate': 1.2108892921960072e-05, 'epoch': 11.9} +{'loss': 24.7812, 'grad_norm': 182.90457153320312, 'learning_rate': 1.2103448275862068e-05, 'epoch': 11.9} +{'loss': 41.5496, 'grad_norm': 232.61126708984375, 'learning_rate': 1.2098003629764066e-05, 'epoch': 11.91} +{'loss': 40.7831, 'grad_norm': 283.25762939453125, 'learning_rate': 1.2092558983666062e-05, 'epoch': 11.91} +{'loss': 40.6287, 'grad_norm': 316.6318359375, 'learning_rate': 1.2087114337568059e-05, 'epoch': 11.91} + 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3310/5520 [2:53:52<1:45:44, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6114257574081421, 'eval_runtime': 3.1351, 'eval_samples_per_second': 57.095, 'eval_steps_per_second': 57.095, 'epoch': 11.91} +{'loss': 40.5648, 'grad_norm': 248.5615234375, 'learning_rate': 1.2081669691470054e-05, 'epoch': 11.92} +{'loss': 42.4736, 'grad_norm': 255.31130981445312, 'learning_rate': 1.2076225045372051e-05, 'epoch': 11.92} +{'loss': 43.112, 'grad_norm': 229.3546600341797, 'learning_rate': 1.2070780399274048e-05, 'epoch': 11.92} +{'loss': 37.9527, 'grad_norm': 226.89553833007812, 'learning_rate': 1.2065335753176044e-05, 'epoch': 11.93} +{'loss': 38.7652, 'grad_norm': 210.63919067382812, 'learning_rate': 1.205989110707804e-05, 'epoch': 11.93} +{'loss': 39.9077, 'grad_norm': 267.75335693359375, 'learning_rate': 1.2054446460980036e-05, 'epoch': 11.94} +{'loss': 39.9008, 'grad_norm': 255.3372802734375, 'learning_rate': 1.2049001814882033e-05, 'epoch': 11.94} +{'loss': 40.8187, 'grad_norm': 220.55332946777344, 'learning_rate': 1.2043557168784029e-05, 'epoch': 11.94} +{'loss': 40.2937, 'grad_norm': 350.15374755859375, 'learning_rate': 1.2038112522686026e-05, 'epoch': 11.95} +{'loss': 41.3939, 'grad_norm': 296.1144714355469, 'learning_rate': 1.2032667876588021e-05, 'epoch': 11.95} + 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3320/5520 [2:54:24<1:45:46, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6116041541099548, 'eval_runtime': 3.1346, 'eval_samples_per_second': 57.104, 'eval_steps_per_second': 57.104, 'epoch': 11.95} +{'loss': 39.108, 'grad_norm': 220.52304077148438, 'learning_rate': 1.202722323049002e-05, 'epoch': 11.95} +{'loss': 39.547, 'grad_norm': 268.8526916503906, 'learning_rate': 1.2021778584392015e-05, 'epoch': 11.96} +{'loss': 36.7144, 'grad_norm': 205.97677612304688, 'learning_rate': 1.201633393829401e-05, 'epoch': 11.96} +{'loss': 34.0491, 'grad_norm': 186.62428283691406, 'learning_rate': 1.2010889292196008e-05, 'epoch': 11.96} +{'loss': 34.1164, 'grad_norm': 214.5521697998047, 'learning_rate': 1.2005444646098003e-05, 'epoch': 11.97} +{'loss': 34.0005, 'grad_norm': 203.8130340576172, 'learning_rate': 1.2e-05, 'epoch': 11.97} +{'loss': 34.0489, 'grad_norm': 207.25648498535156, 'learning_rate': 1.1994555353901997e-05, 'epoch': 11.98} +{'loss': 35.0359, 'grad_norm': 271.1595458984375, 'learning_rate': 1.1989110707803993e-05, 'epoch': 11.98} +{'loss': 36.4684, 'grad_norm': 266.0697021484375, 'learning_rate': 1.198366606170599e-05, 'epoch': 11.98} +{'loss': 35.8805, 'grad_norm': 264.1314392089844, 'learning_rate': 1.1978221415607985e-05, 'epoch': 11.99} + 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3330/5520 [2:54:55<1:44:36, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6101864576339722, 'eval_runtime': 3.1361, 'eval_samples_per_second': 57.077, 'eval_steps_per_second': 57.077, 'epoch': 11.99} +{'loss': 37.2928, 'grad_norm': 266.34295654296875, 'learning_rate': 1.1972776769509982e-05, 'epoch': 11.99} +{'loss': 29.0638, 'grad_norm': 222.19161987304688, 'learning_rate': 1.196733212341198e-05, 'epoch': 11.99} +{'loss': 23.6752, 'grad_norm': 244.96974182128906, 'learning_rate': 1.1961887477313975e-05, 'epoch': 12.0} +{'loss': 20.9293, 'grad_norm': 227.6931915283203, 'learning_rate': 1.195644283121597e-05, 'epoch': 12.0} +{'loss': 39.7694, 'grad_norm': 259.7235412597656, 'learning_rate': 1.1950998185117969e-05, 'epoch': 12.0} +{'loss': 41.3742, 'grad_norm': 258.8477783203125, 'learning_rate': 1.1945553539019964e-05, 'epoch': 12.01} +{'loss': 40.0706, 'grad_norm': 216.0697784423828, 'learning_rate': 1.194010889292196e-05, 'epoch': 12.01} +{'loss': 39.844, 'grad_norm': 197.73046875, 'learning_rate': 1.1934664246823957e-05, 'epoch': 12.01} +{'loss': 41.8877, 'grad_norm': 190.29563903808594, 'learning_rate': 1.1929219600725952e-05, 'epoch': 12.02} +{'loss': 40.5782, 'grad_norm': 190.01197814941406, 'learning_rate': 1.1923774954627951e-05, 'epoch': 12.02} + 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3340/5520 [2:55:27<1:44:50, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6100598573684692, 'eval_runtime': 3.1334, 'eval_samples_per_second': 57.126, 'eval_steps_per_second': 57.126, 'epoch': 12.02} +{'loss': 42.9183, 'grad_norm': 283.20965576171875, 'learning_rate': 1.1918330308529946e-05, 'epoch': 12.03} +{'loss': 41.4606, 'grad_norm': 227.9106903076172, 'learning_rate': 1.1912885662431942e-05, 'epoch': 12.03} +{'loss': 40.527, 'grad_norm': 217.31640625, 'learning_rate': 1.1907441016333939e-05, 'epoch': 12.03} +{'loss': 40.2536, 'grad_norm': 181.33787536621094, 'learning_rate': 1.1901996370235936e-05, 'epoch': 12.04} +{'loss': 39.0234, 'grad_norm': 210.638427734375, 'learning_rate': 1.1896551724137931e-05, 'epoch': 12.04} +{'loss': 36.6929, 'grad_norm': 222.1325225830078, 'learning_rate': 1.1891107078039928e-05, 'epoch': 12.04} +{'loss': 37.9547, 'grad_norm': 195.0751953125, 'learning_rate': 1.1885662431941924e-05, 'epoch': 12.05} +{'loss': 37.9016, 'grad_norm': 287.6582946777344, 'learning_rate': 1.1880217785843919e-05, 'epoch': 12.05} +{'loss': 40.014, 'grad_norm': 351.43701171875, 'learning_rate': 1.1874773139745918e-05, 'epoch': 12.05} +{'loss': 37.8761, 'grad_norm': 212.9033966064453, 'learning_rate': 1.1869328493647913e-05, 'epoch': 12.06} + 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3350/5520 [2:55:58<1:44:06, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6093400120735168, 'eval_runtime': 3.1352, 'eval_samples_per_second': 57.093, 'eval_steps_per_second': 57.093, 'epoch': 12.06} +{'loss': 38.7171, 'grad_norm': 268.8284912109375, 'learning_rate': 1.186388384754991e-05, 'epoch': 12.06} +{'loss': 38.4908, 'grad_norm': 193.27267456054688, 'learning_rate': 1.1858439201451906e-05, 'epoch': 12.07} +{'loss': 37.9388, 'grad_norm': 244.18124389648438, 'learning_rate': 1.1852994555353901e-05, 'epoch': 12.07} +{'loss': 38.4287, 'grad_norm': 311.6593933105469, 'learning_rate': 1.18475499092559e-05, 'epoch': 12.07} +{'loss': 38.1349, 'grad_norm': 239.28526306152344, 'learning_rate': 1.1842105263157895e-05, 'epoch': 12.08} +{'loss': 39.8067, 'grad_norm': 312.1795654296875, 'learning_rate': 1.183666061705989e-05, 'epoch': 12.08} +{'loss': 40.0617, 'grad_norm': 303.3067932128906, 'learning_rate': 1.1831215970961888e-05, 'epoch': 12.08} +{'loss': 39.244, 'grad_norm': 280.8705749511719, 'learning_rate': 1.1825771324863885e-05, 'epoch': 12.09} +{'loss': 39.0047, 'grad_norm': 249.89671325683594, 'learning_rate': 1.182032667876588e-05, 'epoch': 12.09} +{'loss': 40.8044, 'grad_norm': 226.19195556640625, 'learning_rate': 1.1814882032667877e-05, 'epoch': 12.09} + 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3360/5520 [2:56:30<1:43:09, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6100687384605408, 'eval_runtime': 3.1344, 'eval_samples_per_second': 57.109, 'eval_steps_per_second': 57.109, 'epoch': 12.09} +{'loss': 38.0745, 'grad_norm': 250.29306030273438, 'learning_rate': 1.1809437386569873e-05, 'epoch': 12.1} +{'loss': 37.2922, 'grad_norm': 255.06137084960938, 'learning_rate': 1.180399274047187e-05, 'epoch': 12.1} +{'loss': 35.488, 'grad_norm': 293.59185791015625, 'learning_rate': 1.1798548094373867e-05, 'epoch': 12.1} +{'loss': 32.8175, 'grad_norm': 260.9599914550781, 'learning_rate': 1.1793103448275862e-05, 'epoch': 12.11} +{'loss': 31.3901, 'grad_norm': 387.63671875, 'learning_rate': 1.178765880217786e-05, 'epoch': 12.11} +{'loss': 32.9512, 'grad_norm': 216.2008819580078, 'learning_rate': 1.1782214156079855e-05, 'epoch': 12.12} +{'loss': 31.838, 'grad_norm': 260.510498046875, 'learning_rate': 1.177676950998185e-05, 'epoch': 12.12} +{'loss': 33.5854, 'grad_norm': 215.96522521972656, 'learning_rate': 1.1771324863883849e-05, 'epoch': 12.12} +{'loss': 34.947, 'grad_norm': 277.2855529785156, 'learning_rate': 1.1765880217785844e-05, 'epoch': 12.13} +{'loss': 34.3862, 'grad_norm': 199.53759765625, 'learning_rate': 1.176043557168784e-05, 'epoch': 12.13} + 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3370/5520 [2:57:01<1:42:53, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6107886433601379, 'eval_runtime': 3.1348, 'eval_samples_per_second': 57.1, 'eval_steps_per_second': 57.1, 'epoch': 12.13} +{'loss': 34.5678, 'grad_norm': 244.73654174804688, 'learning_rate': 1.1754990925589837e-05, 'epoch': 12.13} +{'loss': 35.8974, 'grad_norm': 335.4967346191406, 'learning_rate': 1.1749546279491834e-05, 'epoch': 12.14} +{'loss': 36.3458, 'grad_norm': 269.8370056152344, 'learning_rate': 1.174410163339383e-05, 'epoch': 12.14} +{'loss': 34.6797, 'grad_norm': 230.82492065429688, 'learning_rate': 1.1738656987295826e-05, 'epoch': 12.14} +{'loss': 35.5799, 'grad_norm': 266.6196594238281, 'learning_rate': 1.1733212341197822e-05, 'epoch': 12.15} +{'loss': 34.9859, 'grad_norm': 268.1825256347656, 'learning_rate': 1.1727767695099819e-05, 'epoch': 12.15} +{'loss': 37.2283, 'grad_norm': 259.6159362792969, 'learning_rate': 1.1722323049001816e-05, 'epoch': 12.16} +{'loss': 37.4073, 'grad_norm': 225.1367645263672, 'learning_rate': 1.1716878402903811e-05, 'epoch': 12.16} +{'loss': 36.3491, 'grad_norm': 277.8457946777344, 'learning_rate': 1.1711433756805808e-05, 'epoch': 12.16} +{'loss': 31.4646, 'grad_norm': 273.1939697265625, 'learning_rate': 1.1705989110707804e-05, 'epoch': 12.17} + 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3380/5520 [2:57:33<1:42:44, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6099494695663452, 'eval_runtime': 3.1323, 'eval_samples_per_second': 57.146, 'eval_steps_per_second': 57.146, 'epoch': 12.17} +{'loss': 22.7125, 'grad_norm': 199.32516479492188, 'learning_rate': 1.17005444646098e-05, 'epoch': 12.17} +{'loss': 22.7899, 'grad_norm': 195.47630310058594, 'learning_rate': 1.1695099818511798e-05, 'epoch': 12.17} +{'loss': 23.4427, 'grad_norm': 220.02413940429688, 'learning_rate': 1.1689655172413793e-05, 'epoch': 12.18} +{'loss': 24.1504, 'grad_norm': 215.43287658691406, 'learning_rate': 1.168421052631579e-05, 'epoch': 12.18} +{'loss': 41.4955, 'grad_norm': 298.2409973144531, 'learning_rate': 1.1678765880217786e-05, 'epoch': 12.18} +{'loss': 42.4273, 'grad_norm': 235.94728088378906, 'learning_rate': 1.1673321234119783e-05, 'epoch': 12.19} +{'loss': 40.6468, 'grad_norm': 235.44480895996094, 'learning_rate': 1.166787658802178e-05, 'epoch': 12.19} +{'loss': 39.8335, 'grad_norm': 281.5338439941406, 'learning_rate': 1.1662431941923775e-05, 'epoch': 12.2} +{'loss': 40.8669, 'grad_norm': 185.87339782714844, 'learning_rate': 1.165698729582577e-05, 'epoch': 12.2} +{'loss': 40.1351, 'grad_norm': 218.88861083984375, 'learning_rate': 1.1651542649727768e-05, 'epoch': 12.2} + 61%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3390/5520 [2:58:04<1:42:19, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6128573417663574, 'eval_runtime': 3.1346, 'eval_samples_per_second': 57.104, 'eval_steps_per_second': 57.104, 'epoch': 12.2} +{'loss': 40.4448, 'grad_norm': 192.7227783203125, 'learning_rate': 1.1646098003629765e-05, 'epoch': 12.21} +{'loss': 41.579, 'grad_norm': 219.68093872070312, 'learning_rate': 1.1640653357531762e-05, 'epoch': 12.21} +{'loss': 41.3374, 'grad_norm': 235.8788299560547, 'learning_rate': 1.1635208711433757e-05, 'epoch': 12.21} +{'loss': 41.1151, 'grad_norm': 245.11935424804688, 'learning_rate': 1.1629764065335752e-05, 'epoch': 12.22} +{'loss': 38.9502, 'grad_norm': 260.2931823730469, 'learning_rate': 1.1624319419237751e-05, 'epoch': 12.22} +{'loss': 38.6309, 'grad_norm': 240.62734985351562, 'learning_rate': 1.1618874773139747e-05, 'epoch': 12.22} +{'loss': 38.3077, 'grad_norm': 230.9380645751953, 'learning_rate': 1.1613430127041742e-05, 'epoch': 12.23} +{'loss': 37.1566, 'grad_norm': 234.40687561035156, 'learning_rate': 1.1607985480943739e-05, 'epoch': 12.23} +{'loss': 38.4919, 'grad_norm': 216.580810546875, 'learning_rate': 1.1602540834845734e-05, 'epoch': 12.23} +{'loss': 38.1647, 'grad_norm': 210.75079345703125, 'learning_rate': 1.1597096188747732e-05, 'epoch': 12.24} + 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3400/5520 [2:58:36<1:41:35, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6105583906173706, 'eval_runtime': 3.1364, 'eval_samples_per_second': 57.072, 'eval_steps_per_second': 57.072, 'epoch': 12.24} +{'loss': 38.5585, 'grad_norm': 207.82180786132812, 'learning_rate': 1.1591651542649729e-05, 'epoch': 12.24} +{'loss': 38.0183, 'grad_norm': 186.55081176757812, 'learning_rate': 1.1586206896551724e-05, 'epoch': 12.25} +{'loss': 39.6951, 'grad_norm': 179.60572814941406, 'learning_rate': 1.1580762250453721e-05, 'epoch': 12.25} +{'loss': 39.2908, 'grad_norm': 212.59837341308594, 'learning_rate': 1.1575317604355718e-05, 'epoch': 12.25} +{'loss': 39.9409, 'grad_norm': 239.90997314453125, 'learning_rate': 1.1569872958257714e-05, 'epoch': 12.26} +{'loss': 39.2386, 'grad_norm': 240.729248046875, 'learning_rate': 1.156442831215971e-05, 'epoch': 12.26} +{'loss': 37.3296, 'grad_norm': 248.6179962158203, 'learning_rate': 1.1558983666061706e-05, 'epoch': 12.26} +{'loss': 40.1156, 'grad_norm': 192.55084228515625, 'learning_rate': 1.1553539019963701e-05, 'epoch': 12.27} +{'loss': 41.0677, 'grad_norm': 217.89109802246094, 'learning_rate': 1.15480943738657e-05, 'epoch': 12.27} +{'loss': 39.3552, 'grad_norm': 240.77633666992188, 'learning_rate': 1.1542649727767695e-05, 'epoch': 12.27} + 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3400/5520 [2:58:39<1:41:35, 2.88s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3410/5520 [2:59:08<1:41:49, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6094763278961182, 'eval_runtime': 3.1364, 'eval_samples_per_second': 57.072, 'eval_steps_per_second': 57.072, 'epoch': 12.27} +{'loss': 40.2202, 'grad_norm': 210.38153076171875, 'learning_rate': 1.1537205081669691e-05, 'epoch': 12.28} +{'loss': 37.5473, 'grad_norm': 195.49087524414062, 'learning_rate': 1.1531760435571688e-05, 'epoch': 12.28} +{'loss': 37.8032, 'grad_norm': 254.43972778320312, 'learning_rate': 1.1526315789473683e-05, 'epoch': 12.29} +{'loss': 35.1317, 'grad_norm': 205.09913635253906, 'learning_rate': 1.1520871143375682e-05, 'epoch': 12.29} +{'loss': 32.7809, 'grad_norm': 241.22930908203125, 'learning_rate': 1.1515426497277677e-05, 'epoch': 12.29} +{'loss': 32.5354, 'grad_norm': 226.75311279296875, 'learning_rate': 1.1509981851179673e-05, 'epoch': 12.3} +{'loss': 33.1533, 'grad_norm': 323.5389709472656, 'learning_rate': 1.150453720508167e-05, 'epoch': 12.3} +{'loss': 33.7924, 'grad_norm': 306.7039794921875, 'learning_rate': 1.1499092558983667e-05, 'epoch': 12.3} +{'loss': 33.829, 'grad_norm': 221.53897094726562, 'learning_rate': 1.1493647912885662e-05, 'epoch': 12.31} +{'loss': 35.4583, 'grad_norm': 301.59527587890625, 'learning_rate': 1.148820326678766e-05, 'epoch': 12.31} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3420/5520 [2:59:40<1:41:09, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6092248558998108, 'eval_runtime': 3.1372, 'eval_samples_per_second': 57.058, 'eval_steps_per_second': 57.058, 'epoch': 12.31} +{'loss': 34.3258, 'grad_norm': 229.63221740722656, 'learning_rate': 1.1482758620689655e-05, 'epoch': 12.31} +{'loss': 33.4522, 'grad_norm': 280.6421203613281, 'learning_rate': 1.147731397459165e-05, 'epoch': 12.32} +{'loss': 34.8911, 'grad_norm': 305.6673889160156, 'learning_rate': 1.1471869328493649e-05, 'epoch': 12.32} +{'loss': 36.2668, 'grad_norm': 278.5484924316406, 'learning_rate': 1.1466424682395644e-05, 'epoch': 12.33} +{'loss': 34.8401, 'grad_norm': 246.88082885742188, 'learning_rate': 1.1460980036297641e-05, 'epoch': 12.33} +{'loss': 36.2382, 'grad_norm': 279.730712890625, 'learning_rate': 1.1455535390199637e-05, 'epoch': 12.33} +{'loss': 37.0742, 'grad_norm': 243.62918090820312, 'learning_rate': 1.1450090744101634e-05, 'epoch': 12.34} +{'loss': 37.0223, 'grad_norm': 280.5240783691406, 'learning_rate': 1.1444646098003631e-05, 'epoch': 12.34} +{'loss': 34.8413, 'grad_norm': 270.56396484375, 'learning_rate': 1.1439201451905626e-05, 'epoch': 12.34} +{'loss': 26.5596, 'grad_norm': 246.56292724609375, 'learning_rate': 1.1433756805807622e-05, 'epoch': 12.35} + 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3430/5520 [3:00:11<1:40:32, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6123174428939819, 'eval_runtime': 3.1325, 'eval_samples_per_second': 57.143, 'eval_steps_per_second': 57.143, 'epoch': 12.35} +{'loss': 23.3959, 'grad_norm': 199.72242736816406, 'learning_rate': 1.1428312159709619e-05, 'epoch': 12.35} +{'loss': 23.448, 'grad_norm': 264.9206848144531, 'learning_rate': 1.1422867513611616e-05, 'epoch': 12.35} +{'loss': 23.4526, 'grad_norm': 198.09420776367188, 'learning_rate': 1.1417422867513613e-05, 'epoch': 12.36} +{'loss': 23.9586, 'grad_norm': 191.74949645996094, 'learning_rate': 1.1411978221415608e-05, 'epoch': 12.36} +{'loss': 41.2497, 'grad_norm': 270.4527893066406, 'learning_rate': 1.1406533575317604e-05, 'epoch': 12.36} +{'loss': 41.7598, 'grad_norm': 253.06109619140625, 'learning_rate': 1.1401088929219601e-05, 'epoch': 12.37} +{'loss': 42.1145, 'grad_norm': 389.3164978027344, 'learning_rate': 1.1395644283121598e-05, 'epoch': 12.37} +{'loss': 39.8163, 'grad_norm': 405.1527404785156, 'learning_rate': 1.1390199637023593e-05, 'epoch': 12.38} +{'loss': 40.7344, 'grad_norm': 360.5083312988281, 'learning_rate': 1.138475499092559e-05, 'epoch': 12.38} +{'loss': 40.6678, 'grad_norm': 276.3650207519531, 'learning_rate': 1.1379310344827586e-05, 'epoch': 12.38} + 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3440/5520 [3:00:43<1:40:11, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.612799346446991, 'eval_runtime': 3.1342, 'eval_samples_per_second': 57.112, 'eval_steps_per_second': 57.112, 'epoch': 12.38} +{'loss': 39.8701, 'grad_norm': 222.34078979492188, 'learning_rate': 1.1373865698729583e-05, 'epoch': 12.39} +{'loss': 42.031, 'grad_norm': 242.1103515625, 'learning_rate': 1.136842105263158e-05, 'epoch': 12.39} +{'loss': 40.7321, 'grad_norm': 231.30453491210938, 'learning_rate': 1.1362976406533575e-05, 'epoch': 12.39} +{'loss': 41.5889, 'grad_norm': 302.65179443359375, 'learning_rate': 1.1357531760435572e-05, 'epoch': 12.4} +{'loss': 40.3939, 'grad_norm': 296.4203796386719, 'learning_rate': 1.1352087114337568e-05, 'epoch': 12.4} +{'loss': 37.9457, 'grad_norm': 281.8349304199219, 'learning_rate': 1.1346642468239565e-05, 'epoch': 12.4} +{'loss': 37.4727, 'grad_norm': 228.9622039794922, 'learning_rate': 1.1341197822141562e-05, 'epoch': 12.41} +{'loss': 36.4285, 'grad_norm': 276.8975524902344, 'learning_rate': 1.1335753176043557e-05, 'epoch': 12.41} +{'loss': 37.7888, 'grad_norm': 218.76206970214844, 'learning_rate': 1.1330308529945553e-05, 'epoch': 12.42} +{'loss': 38.6416, 'grad_norm': 277.31329345703125, 'learning_rate': 1.1324863883847551e-05, 'epoch': 12.42} + 62%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3450/5520 [3:01:15<1:39:24, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6118359565734863, 'eval_runtime': 3.1346, 'eval_samples_per_second': 57.105, 'eval_steps_per_second': 57.105, 'epoch': 12.42} +{'loss': 38.3779, 'grad_norm': 239.2766876220703, 'learning_rate': 1.1319419237749547e-05, 'epoch': 12.42} +{'loss': 38.7581, 'grad_norm': 255.43939208984375, 'learning_rate': 1.1313974591651542e-05, 'epoch': 12.43} +{'loss': 40.1953, 'grad_norm': 196.33380126953125, 'learning_rate': 1.130852994555354e-05, 'epoch': 12.43} +{'loss': 39.2743, 'grad_norm': 284.2427062988281, 'learning_rate': 1.1303085299455535e-05, 'epoch': 12.43} +{'loss': 39.4786, 'grad_norm': 303.0172424316406, 'learning_rate': 1.1297640653357533e-05, 'epoch': 12.44} +{'loss': 38.6038, 'grad_norm': 231.17999267578125, 'learning_rate': 1.1292196007259529e-05, 'epoch': 12.44} +{'loss': 39.0235, 'grad_norm': 228.89599609375, 'learning_rate': 1.1286751361161524e-05, 'epoch': 12.44} +{'loss': 39.9779, 'grad_norm': 247.05203247070312, 'learning_rate': 1.1281306715063521e-05, 'epoch': 12.45} +{'loss': 40.4104, 'grad_norm': 221.5463104248047, 'learning_rate': 1.1275862068965517e-05, 'epoch': 12.45} +{'loss': 40.8093, 'grad_norm': 254.12820434570312, 'learning_rate': 1.1270417422867514e-05, 'epoch': 12.46} + 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3460/5520 [3:01:46<1:39:11, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6093817353248596, 'eval_runtime': 3.1364, 'eval_samples_per_second': 57.071, 'eval_steps_per_second': 57.071, 'epoch': 12.46} +{'loss': 40.3578, 'grad_norm': 214.2323760986328, 'learning_rate': 1.1264972776769511e-05, 'epoch': 12.46} +{'loss': 39.772, 'grad_norm': 230.64718627929688, 'learning_rate': 1.1259528130671506e-05, 'epoch': 12.46} +{'loss': 36.8193, 'grad_norm': 217.81838989257812, 'learning_rate': 1.1254083484573502e-05, 'epoch': 12.47} +{'loss': 33.891, 'grad_norm': 292.7674560546875, 'learning_rate': 1.12486388384755e-05, 'epoch': 12.47} +{'loss': 34.8947, 'grad_norm': 241.6099395751953, 'learning_rate': 1.1243194192377496e-05, 'epoch': 12.47} +{'loss': 31.7715, 'grad_norm': 220.97128295898438, 'learning_rate': 1.1237749546279493e-05, 'epoch': 12.48} +{'loss': 32.3878, 'grad_norm': 191.04376220703125, 'learning_rate': 1.1232304900181488e-05, 'epoch': 12.48} +{'loss': 33.3116, 'grad_norm': 192.3009796142578, 'learning_rate': 1.1226860254083484e-05, 'epoch': 12.48} +{'loss': 34.1394, 'grad_norm': 214.22459411621094, 'learning_rate': 1.1221415607985482e-05, 'epoch': 12.49} +{'loss': 34.9381, 'grad_norm': 225.24191284179688, 'learning_rate': 1.1215970961887478e-05, 'epoch': 12.49} + 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3470/5520 [3:02:18<1:38:52, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6095408201217651, 'eval_runtime': 3.136, 'eval_samples_per_second': 57.079, 'eval_steps_per_second': 57.079, 'epoch': 12.49} +{'loss': 34.5342, 'grad_norm': 240.89199829101562, 'learning_rate': 1.1210526315789473e-05, 'epoch': 12.49} +{'loss': 35.3287, 'grad_norm': 263.5467224121094, 'learning_rate': 1.120508166969147e-05, 'epoch': 12.5} +{'loss': 35.4859, 'grad_norm': 253.0650634765625, 'learning_rate': 1.1199637023593467e-05, 'epoch': 12.5} +{'loss': 33.919, 'grad_norm': 279.4447937011719, 'learning_rate': 1.1194192377495463e-05, 'epoch': 12.51} +{'loss': 35.2743, 'grad_norm': 246.6184844970703, 'learning_rate': 1.118874773139746e-05, 'epoch': 12.51} +{'loss': 36.0865, 'grad_norm': 228.4134979248047, 'learning_rate': 1.1183303085299455e-05, 'epoch': 12.51} +{'loss': 36.1596, 'grad_norm': 264.87835693359375, 'learning_rate': 1.1177858439201452e-05, 'epoch': 12.52} +{'loss': 35.7293, 'grad_norm': 252.2872772216797, 'learning_rate': 1.117241379310345e-05, 'epoch': 12.52} +{'loss': 36.8009, 'grad_norm': 277.3695373535156, 'learning_rate': 1.1166969147005445e-05, 'epoch': 12.52} +{'loss': 28.5986, 'grad_norm': 255.64610290527344, 'learning_rate': 1.1161524500907442e-05, 'epoch': 12.53} + 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3480/5520 [3:02:50<1:38:06, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6122347116470337, 'eval_runtime': 3.1329, 'eval_samples_per_second': 57.136, 'eval_steps_per_second': 57.136, 'epoch': 12.53} +{'loss': 23.1289, 'grad_norm': 256.1487121582031, 'learning_rate': 1.1156079854809437e-05, 'epoch': 12.53} +{'loss': 22.3379, 'grad_norm': 261.9757080078125, 'learning_rate': 1.1150635208711433e-05, 'epoch': 12.53} +{'loss': 23.6192, 'grad_norm': 194.83432006835938, 'learning_rate': 1.1145190562613431e-05, 'epoch': 12.54} +{'loss': 24.0314, 'grad_norm': 241.51089477539062, 'learning_rate': 1.1139745916515427e-05, 'epoch': 12.54} +{'loss': 40.2969, 'grad_norm': 242.6024932861328, 'learning_rate': 1.1134301270417424e-05, 'epoch': 12.55} +{'loss': 42.3448, 'grad_norm': 292.17303466796875, 'learning_rate': 1.112885662431942e-05, 'epoch': 12.55} +{'loss': 41.7642, 'grad_norm': 232.811767578125, 'learning_rate': 1.1123411978221416e-05, 'epoch': 12.55} +{'loss': 41.0827, 'grad_norm': 238.43162536621094, 'learning_rate': 1.1117967332123413e-05, 'epoch': 12.56} +{'loss': 41.3795, 'grad_norm': 290.20159912109375, 'learning_rate': 1.1112522686025409e-05, 'epoch': 12.56} +{'loss': 40.6337, 'grad_norm': 197.52903747558594, 'learning_rate': 1.1107078039927404e-05, 'epoch': 12.56} + 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3490/5520 [3:03:21<1:37:59, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6133883595466614, 'eval_runtime': 3.1329, 'eval_samples_per_second': 57.135, 'eval_steps_per_second': 57.135, 'epoch': 12.56} +{'loss': 40.2626, 'grad_norm': 259.8161926269531, 'learning_rate': 1.1101633393829401e-05, 'epoch': 12.57} +{'loss': 41.0171, 'grad_norm': 196.7882537841797, 'learning_rate': 1.1096188747731398e-05, 'epoch': 12.57} +{'loss': 42.1328, 'grad_norm': 216.27642822265625, 'learning_rate': 1.1090744101633394e-05, 'epoch': 12.57} +{'loss': 39.9502, 'grad_norm': 292.6575012207031, 'learning_rate': 1.108529945553539e-05, 'epoch': 12.58} +{'loss': 41.3409, 'grad_norm': 254.43344116210938, 'learning_rate': 1.1079854809437386e-05, 'epoch': 12.58} +{'loss': 39.6898, 'grad_norm': 211.3965606689453, 'learning_rate': 1.1074410163339385e-05, 'epoch': 12.59} +{'loss': 38.0837, 'grad_norm': 196.2000274658203, 'learning_rate': 1.106896551724138e-05, 'epoch': 12.59} +{'loss': 38.479, 'grad_norm': 224.4564666748047, 'learning_rate': 1.1063520871143376e-05, 'epoch': 12.59} +{'loss': 38.3103, 'grad_norm': 215.7074432373047, 'learning_rate': 1.1058076225045373e-05, 'epoch': 12.6} +{'loss': 37.9399, 'grad_norm': 278.2279052734375, 'learning_rate': 1.1052631578947368e-05, 'epoch': 12.6} + 63%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3500/5520 [3:03:53<1:36:57, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6091782450675964, 'eval_runtime': 3.1334, 'eval_samples_per_second': 57.127, 'eval_steps_per_second': 57.127, 'epoch': 12.6} +{'loss': 38.185, 'grad_norm': 236.7021942138672, 'learning_rate': 1.1047186932849365e-05, 'epoch': 12.6} +{'loss': 38.7405, 'grad_norm': 200.35169982910156, 'learning_rate': 1.1041742286751362e-05, 'epoch': 12.61} +{'loss': 39.8351, 'grad_norm': 211.9726104736328, 'learning_rate': 1.1036297640653358e-05, 'epoch': 12.61} +{'loss': 39.3039, 'grad_norm': 303.5962829589844, 'learning_rate': 1.1030852994555353e-05, 'epoch': 12.61} +{'loss': 39.9149, 'grad_norm': 298.086181640625, 'learning_rate': 1.102540834845735e-05, 'epoch': 12.62} +{'loss': 36.3617, 'grad_norm': 255.69854736328125, 'learning_rate': 1.1019963702359347e-05, 'epoch': 12.62} +{'loss': 38.6865, 'grad_norm': 273.2884216308594, 'learning_rate': 1.1014519056261344e-05, 'epoch': 12.62} +{'loss': 40.2771, 'grad_norm': 211.17837524414062, 'learning_rate': 1.100907441016334e-05, 'epoch': 12.63} +{'loss': 40.3644, 'grad_norm': 253.9141845703125, 'learning_rate': 1.1003629764065335e-05, 'epoch': 12.63} +{'loss': 39.9754, 'grad_norm': 247.4141082763672, 'learning_rate': 1.0998185117967334e-05, 'epoch': 12.64} + 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3510/5520 [3:04:24<1:36:17, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6086810827255249, 'eval_runtime': 3.1368, 'eval_samples_per_second': 57.065, 'eval_steps_per_second': 57.065, 'epoch': 12.64} +{'loss': 39.9438, 'grad_norm': 237.3258056640625, 'learning_rate': 1.0992740471869329e-05, 'epoch': 12.64} +{'loss': 39.9713, 'grad_norm': 252.87744140625, 'learning_rate': 1.0987295825771325e-05, 'epoch': 12.64} +{'loss': 36.54, 'grad_norm': 341.2947998046875, 'learning_rate': 1.0981851179673322e-05, 'epoch': 12.65} +{'loss': 33.2737, 'grad_norm': 212.7144317626953, 'learning_rate': 1.0976406533575317e-05, 'epoch': 12.65} +{'loss': 34.8862, 'grad_norm': 220.15846252441406, 'learning_rate': 1.0970961887477314e-05, 'epoch': 12.65} +{'loss': 31.637, 'grad_norm': 235.8145294189453, 'learning_rate': 1.0965517241379311e-05, 'epoch': 12.66} +{'loss': 33.6111, 'grad_norm': 274.13140869140625, 'learning_rate': 1.0960072595281307e-05, 'epoch': 12.66} +{'loss': 34.7118, 'grad_norm': 259.9810791015625, 'learning_rate': 1.0954627949183304e-05, 'epoch': 12.66} +{'loss': 34.3987, 'grad_norm': 244.6074676513672, 'learning_rate': 1.0949183303085299e-05, 'epoch': 12.67} +{'loss': 34.7304, 'grad_norm': 264.0238037109375, 'learning_rate': 1.0943738656987296e-05, 'epoch': 12.67} + 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 3520/5520 [3:04:56<1:35:59, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6089194416999817, 'eval_runtime': 3.1395, 'eval_samples_per_second': 57.015, 'eval_steps_per_second': 57.015, 'epoch': 12.67} +{'loss': 34.5722, 'grad_norm': 286.857421875, 'learning_rate': 1.0938294010889293e-05, 'epoch': 12.68} +{'loss': 35.6129, 'grad_norm': 270.7839660644531, 'learning_rate': 1.0932849364791289e-05, 'epoch': 12.68} +{'loss': 34.4318, 'grad_norm': 214.4302978515625, 'learning_rate': 1.0927404718693284e-05, 'epoch': 12.68} +{'loss': 35.6578, 'grad_norm': 362.6913757324219, 'learning_rate': 1.0921960072595283e-05, 'epoch': 12.69} +{'loss': 35.8627, 'grad_norm': 266.5205993652344, 'learning_rate': 1.0916515426497278e-05, 'epoch': 12.69} +{'loss': 36.8931, 'grad_norm': 271.8298034667969, 'learning_rate': 1.0911070780399275e-05, 'epoch': 12.69} +{'loss': 35.8972, 'grad_norm': 230.13815307617188, 'learning_rate': 1.090562613430127e-05, 'epoch': 12.7} +{'loss': 36.7884, 'grad_norm': 235.57127380371094, 'learning_rate': 1.0900181488203266e-05, 'epoch': 12.7} +{'loss': 35.938, 'grad_norm': 274.0856018066406, 'learning_rate': 1.0894736842105265e-05, 'epoch': 12.7} +{'loss': 30.846, 'grad_norm': 251.9855194091797, 'learning_rate': 1.088929219600726e-05, 'epoch': 12.71} + 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3530/5520 [3:05:27<1:35:44, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6102532148361206, 'eval_runtime': 3.1368, 'eval_samples_per_second': 57.065, 'eval_steps_per_second': 57.065, 'epoch': 12.71} +{'loss': 22.8538, 'grad_norm': 254.11465454101562, 'learning_rate': 1.0883847549909255e-05, 'epoch': 12.71} +{'loss': 22.3346, 'grad_norm': 233.05821228027344, 'learning_rate': 1.0878402903811253e-05, 'epoch': 12.72} +{'loss': 23.8109, 'grad_norm': 223.46646118164062, 'learning_rate': 1.087295825771325e-05, 'epoch': 12.72} +{'loss': 24.7694, 'grad_norm': 209.4064483642578, 'learning_rate': 1.0867513611615245e-05, 'epoch': 12.72} +{'loss': 40.8879, 'grad_norm': 299.6215515136719, 'learning_rate': 1.0862068965517242e-05, 'epoch': 12.73} +{'loss': 41.5875, 'grad_norm': 272.5259704589844, 'learning_rate': 1.0856624319419237e-05, 'epoch': 12.73} +{'loss': 41.5546, 'grad_norm': 219.70687866210938, 'learning_rate': 1.0851179673321235e-05, 'epoch': 12.73} +{'loss': 40.0984, 'grad_norm': 250.9104766845703, 'learning_rate': 1.0845735027223232e-05, 'epoch': 12.74} +{'loss': 40.564, 'grad_norm': 260.9254150390625, 'learning_rate': 1.0840290381125227e-05, 'epoch': 12.74} +{'loss': 40.3864, 'grad_norm': 275.46221923828125, 'learning_rate': 1.0834845735027224e-05, 'epoch': 12.74} + 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3540/5520 [3:05:59<1:35:26, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6099677681922913, 'eval_runtime': 3.1443, 'eval_samples_per_second': 56.928, 'eval_steps_per_second': 56.928, 'epoch': 12.74} +{'loss': 40.5753, 'grad_norm': 200.9589385986328, 'learning_rate': 1.082940108892922e-05, 'epoch': 12.75} +{'loss': 41.4702, 'grad_norm': 228.87669372558594, 'learning_rate': 1.0823956442831215e-05, 'epoch': 12.75} +{'loss': 41.6641, 'grad_norm': 218.6998748779297, 'learning_rate': 1.0818511796733214e-05, 'epoch': 12.75} +{'loss': 41.8016, 'grad_norm': 422.519775390625, 'learning_rate': 1.0813067150635209e-05, 'epoch': 12.76} +{'loss': 40.6053, 'grad_norm': 198.31935119628906, 'learning_rate': 1.0807622504537204e-05, 'epoch': 12.76} +{'loss': 38.7974, 'grad_norm': 274.42333984375, 'learning_rate': 1.0802177858439201e-05, 'epoch': 12.77} +{'loss': 37.157, 'grad_norm': 267.5847473144531, 'learning_rate': 1.0796733212341199e-05, 'epoch': 12.77} +{'loss': 38.1585, 'grad_norm': 264.9976806640625, 'learning_rate': 1.0791288566243196e-05, 'epoch': 12.77} +{'loss': 38.0501, 'grad_norm': 216.5603790283203, 'learning_rate': 1.0785843920145191e-05, 'epoch': 12.78} +{'loss': 38.3114, 'grad_norm': 193.55081176757812, 'learning_rate': 1.0780399274047186e-05, 'epoch': 12.78} + 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3550/5520 [3:06:31<1:34:33, 2.88s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6059894561767578, 'eval_runtime': 3.1334, 'eval_samples_per_second': 57.126, 'eval_steps_per_second': 57.126, 'epoch': 12.78} +{'loss': 38.7056, 'grad_norm': 256.3584289550781, 'learning_rate': 1.0774954627949183e-05, 'epoch': 12.78} +{'loss': 39.3947, 'grad_norm': 203.17401123046875, 'learning_rate': 1.076950998185118e-05, 'epoch': 12.79} +{'loss': 39.2121, 'grad_norm': 307.99517822265625, 'learning_rate': 1.0764065335753176e-05, 'epoch': 12.79} +{'loss': 38.4621, 'grad_norm': 199.4147186279297, 'learning_rate': 1.0758620689655173e-05, 'epoch': 12.79} +{'loss': 38.2742, 'grad_norm': 251.60293579101562, 'learning_rate': 1.0753176043557168e-05, 'epoch': 12.8} +{'loss': 38.6803, 'grad_norm': 277.1817321777344, 'learning_rate': 1.0747731397459165e-05, 'epoch': 12.8} +{'loss': 39.7843, 'grad_norm': 303.2837219238281, 'learning_rate': 1.0742286751361163e-05, 'epoch': 12.81} +{'loss': 41.3761, 'grad_norm': 321.22772216796875, 'learning_rate': 1.0736842105263158e-05, 'epoch': 12.81} +{'loss': 40.3649, 'grad_norm': 238.89007568359375, 'learning_rate': 1.0731397459165155e-05, 'epoch': 12.81} +{'loss': 40.8151, 'grad_norm': 251.22291564941406, 'learning_rate': 1.072595281306715e-05, 'epoch': 12.82} + 64%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3560/5520 [3:07:02<1:34:38, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6065003275871277, 'eval_runtime': 3.1366, 'eval_samples_per_second': 57.067, 'eval_steps_per_second': 57.067, 'epoch': 12.82} +{'loss': 39.381, 'grad_norm': 218.13418579101562, 'learning_rate': 1.0720508166969147e-05, 'epoch': 12.82} +{'loss': 39.8923, 'grad_norm': 250.90328979492188, 'learning_rate': 1.0715063520871145e-05, 'epoch': 12.82} +{'loss': 36.836, 'grad_norm': 227.4825897216797, 'learning_rate': 1.070961887477314e-05, 'epoch': 12.83} +{'loss': 34.499, 'grad_norm': 253.7106475830078, 'learning_rate': 1.0704174228675135e-05, 'epoch': 12.83} +{'loss': 33.3409, 'grad_norm': 280.0548400878906, 'learning_rate': 1.0698729582577132e-05, 'epoch': 12.83} +{'loss': 32.4868, 'grad_norm': 201.3768768310547, 'learning_rate': 1.069328493647913e-05, 'epoch': 12.84} +{'loss': 32.8295, 'grad_norm': 245.73446655273438, 'learning_rate': 1.0687840290381125e-05, 'epoch': 12.84} +{'loss': 33.2009, 'grad_norm': 195.0170440673828, 'learning_rate': 1.0682395644283122e-05, 'epoch': 12.85} +{'loss': 33.0627, 'grad_norm': 261.66357421875, 'learning_rate': 1.0676950998185117e-05, 'epoch': 12.85} +{'loss': 34.184, 'grad_norm': 299.0184326171875, 'learning_rate': 1.0671506352087116e-05, 'epoch': 12.85} + 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3570/5520 [3:07:34<1:33:56, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6077792048454285, 'eval_runtime': 3.1381, 'eval_samples_per_second': 57.041, 'eval_steps_per_second': 57.041, 'epoch': 12.85} +{'loss': 34.748, 'grad_norm': 293.9249572753906, 'learning_rate': 1.0666061705989111e-05, 'epoch': 12.86} +{'loss': 33.8454, 'grad_norm': 206.4182586669922, 'learning_rate': 1.0660617059891107e-05, 'epoch': 12.86} +{'loss': 35.7317, 'grad_norm': 261.4427185058594, 'learning_rate': 1.0655172413793104e-05, 'epoch': 12.86} +{'loss': 35.2389, 'grad_norm': 236.60704040527344, 'learning_rate': 1.06497277676951e-05, 'epoch': 12.87} +{'loss': 34.8523, 'grad_norm': 272.9973449707031, 'learning_rate': 1.0644283121597096e-05, 'epoch': 12.87} +{'loss': 34.7236, 'grad_norm': 228.82540893554688, 'learning_rate': 1.0638838475499093e-05, 'epoch': 12.87} +{'loss': 36.1574, 'grad_norm': 266.6078796386719, 'learning_rate': 1.0633393829401089e-05, 'epoch': 12.88} +{'loss': 36.8466, 'grad_norm': 267.52239990234375, 'learning_rate': 1.0627949183303086e-05, 'epoch': 12.88} +{'loss': 37.2803, 'grad_norm': 261.0372314453125, 'learning_rate': 1.0622504537205083e-05, 'epoch': 12.88} +{'loss': 29.4233, 'grad_norm': 220.42532348632812, 'learning_rate': 1.0617059891107078e-05, 'epoch': 12.89} + 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3580/5520 [3:08:06<1:33:55, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6131581664085388, 'eval_runtime': 3.1384, 'eval_samples_per_second': 57.035, 'eval_steps_per_second': 57.035, 'epoch': 12.89} +{'loss': 23.3851, 'grad_norm': 187.53604125976562, 'learning_rate': 1.0611615245009075e-05, 'epoch': 12.89} +{'loss': 23.3155, 'grad_norm': 227.1913299560547, 'learning_rate': 1.060617059891107e-05, 'epoch': 12.9} +{'loss': 24.4548, 'grad_norm': 202.15939331054688, 'learning_rate': 1.0600725952813066e-05, 'epoch': 12.9} +{'loss': 24.2037, 'grad_norm': 195.67282104492188, 'learning_rate': 1.0595281306715065e-05, 'epoch': 12.9} +{'loss': 41.6489, 'grad_norm': 303.0018310546875, 'learning_rate': 1.058983666061706e-05, 'epoch': 12.91} +{'loss': 40.3682, 'grad_norm': 193.92433166503906, 'learning_rate': 1.0584392014519056e-05, 'epoch': 12.91} +{'loss': 40.5065, 'grad_norm': 305.50750732421875, 'learning_rate': 1.0578947368421053e-05, 'epoch': 12.91} +{'loss': 41.6387, 'grad_norm': 223.41732788085938, 'learning_rate': 1.0573502722323048e-05, 'epoch': 12.92} +{'loss': 41.3623, 'grad_norm': 215.65061950683594, 'learning_rate': 1.0568058076225047e-05, 'epoch': 12.92} +{'loss': 40.7444, 'grad_norm': 223.95880126953125, 'learning_rate': 1.0562613430127042e-05, 'epoch': 12.92} + 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3590/5520 [3:08:38<1:33:03, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6113386750221252, 'eval_runtime': 3.1363, 'eval_samples_per_second': 57.074, 'eval_steps_per_second': 57.074, 'epoch': 12.92} +{'loss': 37.8137, 'grad_norm': 247.3272247314453, 'learning_rate': 1.0557168784029038e-05, 'epoch': 12.93} +{'loss': 38.6946, 'grad_norm': 277.4321594238281, 'learning_rate': 1.0551724137931035e-05, 'epoch': 12.93} +{'loss': 39.0059, 'grad_norm': 219.15576171875, 'learning_rate': 1.0546279491833032e-05, 'epoch': 12.94} +{'loss': 39.2436, 'grad_norm': 205.6105194091797, 'learning_rate': 1.0540834845735027e-05, 'epoch': 12.94} +{'loss': 39.2451, 'grad_norm': 303.84521484375, 'learning_rate': 1.0535390199637024e-05, 'epoch': 12.94} +{'loss': 38.1849, 'grad_norm': 326.2321472167969, 'learning_rate': 1.052994555353902e-05, 'epoch': 12.95} +{'loss': 39.7121, 'grad_norm': 332.7608642578125, 'learning_rate': 1.0524500907441015e-05, 'epoch': 12.95} +{'loss': 39.6558, 'grad_norm': 245.19827270507812, 'learning_rate': 1.0519056261343014e-05, 'epoch': 12.95} +{'loss': 38.6437, 'grad_norm': 227.54763793945312, 'learning_rate': 1.051361161524501e-05, 'epoch': 12.96} +{'loss': 39.083, 'grad_norm': 273.1142272949219, 'learning_rate': 1.0508166969147006e-05, 'epoch': 12.96} + 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3600/5520 [3:09:09<1:33:04, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6050187349319458, 'eval_runtime': 3.1339, 'eval_samples_per_second': 57.118, 'eval_steps_per_second': 57.118, 'epoch': 12.96} +{'loss': 34.0254, 'grad_norm': 227.0492401123047, 'learning_rate': 1.0502722323049002e-05, 'epoch': 12.96} +{'loss': 32.4569, 'grad_norm': 201.76736450195312, 'learning_rate': 1.0497277676950999e-05, 'epoch': 12.97} +{'loss': 33.8718, 'grad_norm': 279.99237060546875, 'learning_rate': 1.0491833030852996e-05, 'epoch': 12.97} +{'loss': 34.8168, 'grad_norm': 351.647705078125, 'learning_rate': 1.0486388384754991e-05, 'epoch': 12.98} +{'loss': 35.1731, 'grad_norm': 275.7414855957031, 'learning_rate': 1.0480943738656987e-05, 'epoch': 12.98} +{'loss': 35.7127, 'grad_norm': 347.0024719238281, 'learning_rate': 1.0475499092558984e-05, 'epoch': 12.98} +{'loss': 34.7709, 'grad_norm': 304.18218994140625, 'learning_rate': 1.047005444646098e-05, 'epoch': 12.99} +{'loss': 37.2105, 'grad_norm': 306.33245849609375, 'learning_rate': 1.0464609800362976e-05, 'epoch': 12.99} +{'loss': 33.6613, 'grad_norm': 326.3535461425781, 'learning_rate': 1.0459165154264973e-05, 'epoch': 12.99} +{'loss': 22.8985, 'grad_norm': 325.7522888183594, 'learning_rate': 1.0453720508166969e-05, 'epoch': 13.0} + 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3600/5520 [3:09:13<1:33:04, 2.91s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 65%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3610/5520 [3:09:42<1:32:03, 2.89s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6073772311210632, 'eval_runtime': 3.1391, 'eval_samples_per_second': 57.023, 'eval_steps_per_second': 57.023, 'epoch': 13.0} +{'loss': 21.3776, 'grad_norm': 256.7010498046875, 'learning_rate': 1.0448275862068966e-05, 'epoch': 13.0} +{'loss': 39.0509, 'grad_norm': 247.7591552734375, 'learning_rate': 1.0442831215970963e-05, 'epoch': 13.0} +{'loss': 41.042, 'grad_norm': 389.6626281738281, 'learning_rate': 1.0437386569872958e-05, 'epoch': 13.01} +{'loss': 39.9542, 'grad_norm': 271.01885986328125, 'learning_rate': 1.0431941923774955e-05, 'epoch': 13.01} +{'loss': 39.8852, 'grad_norm': 263.2490539550781, 'learning_rate': 1.042649727767695e-05, 'epoch': 13.01} +{'loss': 39.3902, 'grad_norm': 255.46878051757812, 'learning_rate': 1.0421052631578948e-05, 'epoch': 13.02} +{'loss': 40.1731, 'grad_norm': 206.02244567871094, 'learning_rate': 1.0415607985480945e-05, 'epoch': 13.02} +{'loss': 39.17, 'grad_norm': 194.83055114746094, 'learning_rate': 1.041016333938294e-05, 'epoch': 13.03} +{'loss': 40.3363, 'grad_norm': 230.1270294189453, 'learning_rate': 1.0404718693284936e-05, 'epoch': 13.03} +{'loss': 40.7774, 'grad_norm': 206.0470733642578, 'learning_rate': 1.0399274047186933e-05, 'epoch': 13.03} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3620/5520 [3:10:13<1:31:44, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6078981161117554, 'eval_runtime': 3.1697, 'eval_samples_per_second': 56.472, 'eval_steps_per_second': 56.472, 'epoch': 13.03} +{'loss': 40.725, 'grad_norm': 210.79327392578125, 'learning_rate': 1.039382940108893e-05, 'epoch': 13.04} +{'loss': 38.8736, 'grad_norm': 200.4281768798828, 'learning_rate': 1.0388384754990927e-05, 'epoch': 13.04} +{'loss': 37.5542, 'grad_norm': 183.33575439453125, 'learning_rate': 1.0382940108892922e-05, 'epoch': 13.04} +{'loss': 36.5576, 'grad_norm': 195.2568817138672, 'learning_rate': 1.0377495462794918e-05, 'epoch': 13.05} +{'loss': 36.9015, 'grad_norm': 223.9565887451172, 'learning_rate': 1.0372050816696916e-05, 'epoch': 13.05} +{'loss': 38.8146, 'grad_norm': 264.0516052246094, 'learning_rate': 1.0366606170598912e-05, 'epoch': 13.05} +{'loss': 37.0338, 'grad_norm': 247.3844757080078, 'learning_rate': 1.0361161524500907e-05, 'epoch': 13.06} +{'loss': 37.3565, 'grad_norm': 243.3253173828125, 'learning_rate': 1.0355716878402904e-05, 'epoch': 13.06} +{'loss': 38.367, 'grad_norm': 213.89939880371094, 'learning_rate': 1.03502722323049e-05, 'epoch': 13.07} +{'loss': 38.3101, 'grad_norm': 254.04953002929688, 'learning_rate': 1.0344827586206898e-05, 'epoch': 13.07} + 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3630/5520 [3:10:45<1:31:42, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6108394861221313, 'eval_runtime': 3.1342, 'eval_samples_per_second': 57.112, 'eval_steps_per_second': 57.112, 'epoch': 13.07} +{'loss': 38.3113, 'grad_norm': 235.3623046875, 'learning_rate': 1.0339382940108894e-05, 'epoch': 13.07} +{'loss': 36.9916, 'grad_norm': 259.0147399902344, 'learning_rate': 1.0333938294010889e-05, 'epoch': 13.08} +{'loss': 36.5944, 'grad_norm': 257.96575927734375, 'learning_rate': 1.0328493647912886e-05, 'epoch': 13.08} +{'loss': 39.7592, 'grad_norm': 228.49131774902344, 'learning_rate': 1.0323049001814882e-05, 'epoch': 13.08} +{'loss': 38.7785, 'grad_norm': 278.5231018066406, 'learning_rate': 1.0317604355716879e-05, 'epoch': 13.09} +{'loss': 39.6878, 'grad_norm': 218.6136932373047, 'learning_rate': 1.0312159709618876e-05, 'epoch': 13.09} +{'loss': 40.5433, 'grad_norm': 231.03012084960938, 'learning_rate': 1.0306715063520871e-05, 'epoch': 13.09} +{'loss': 39.1311, 'grad_norm': 254.7096405029297, 'learning_rate': 1.0301270417422866e-05, 'epoch': 13.1} +{'loss': 38.6237, 'grad_norm': 303.50274658203125, 'learning_rate': 1.0295825771324865e-05, 'epoch': 13.1} +{'loss': 36.5534, 'grad_norm': 217.4394073486328, 'learning_rate': 1.029038112522686e-05, 'epoch': 13.1} + 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3640/5520 [3:11:17<1:31:41, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6075544357299805, 'eval_runtime': 3.1475, 'eval_samples_per_second': 56.87, 'eval_steps_per_second': 56.87, 'epoch': 13.1} +{'loss': 34.2153, 'grad_norm': 249.18490600585938, 'learning_rate': 1.0284936479128858e-05, 'epoch': 13.11} +{'loss': 33.7793, 'grad_norm': 261.9061584472656, 'learning_rate': 1.0279491833030853e-05, 'epoch': 13.11} +{'loss': 31.2934, 'grad_norm': 205.93113708496094, 'learning_rate': 1.0274047186932848e-05, 'epoch': 13.12} +{'loss': 31.9074, 'grad_norm': 203.82980346679688, 'learning_rate': 1.0268602540834847e-05, 'epoch': 13.12} +{'loss': 32.6883, 'grad_norm': 309.0658874511719, 'learning_rate': 1.0263157894736843e-05, 'epoch': 13.12} +{'loss': 34.1261, 'grad_norm': 239.59312438964844, 'learning_rate': 1.0257713248638838e-05, 'epoch': 13.13} +{'loss': 34.7656, 'grad_norm': 360.4351501464844, 'learning_rate': 1.0252268602540835e-05, 'epoch': 13.13} +{'loss': 34.6533, 'grad_norm': 319.87451171875, 'learning_rate': 1.024682395644283e-05, 'epoch': 13.13} +{'loss': 33.9159, 'grad_norm': 352.31707763671875, 'learning_rate': 1.0241379310344828e-05, 'epoch': 13.14} +{'loss': 34.6115, 'grad_norm': 288.85418701171875, 'learning_rate': 1.0235934664246825e-05, 'epoch': 13.14} + 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3650/5520 [3:11:49<1:30:23, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6106187105178833, 'eval_runtime': 3.1535, 'eval_samples_per_second': 56.763, 'eval_steps_per_second': 56.763, 'epoch': 13.14} +{'loss': 34.3008, 'grad_norm': 263.8638000488281, 'learning_rate': 1.023049001814882e-05, 'epoch': 13.14} +{'loss': 35.9397, 'grad_norm': 308.10650634765625, 'learning_rate': 1.0225045372050817e-05, 'epoch': 13.15} +{'loss': 34.2573, 'grad_norm': 208.60519409179688, 'learning_rate': 1.0219600725952814e-05, 'epoch': 13.15} +{'loss': 35.853, 'grad_norm': 251.36766052246094, 'learning_rate': 1.021415607985481e-05, 'epoch': 13.16} +{'loss': 35.7057, 'grad_norm': 264.94818115234375, 'learning_rate': 1.0208711433756807e-05, 'epoch': 13.16} +{'loss': 34.611, 'grad_norm': 313.0333251953125, 'learning_rate': 1.0203266787658802e-05, 'epoch': 13.16} +{'loss': 31.1751, 'grad_norm': 254.9687042236328, 'learning_rate': 1.0197822141560797e-05, 'epoch': 13.17} +{'loss': 22.8425, 'grad_norm': 219.7308349609375, 'learning_rate': 1.0192377495462796e-05, 'epoch': 13.17} +{'loss': 22.5266, 'grad_norm': 305.76416015625, 'learning_rate': 1.0186932849364792e-05, 'epoch': 13.17} +{'loss': 23.861, 'grad_norm': 301.26239013671875, 'learning_rate': 1.0181488203266787e-05, 'epoch': 13.18} + 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 3660/5520 [3:12:21<1:31:40, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6107029914855957, 'eval_runtime': 3.1314, 'eval_samples_per_second': 57.162, 'eval_steps_per_second': 57.162, 'epoch': 13.18} +{'loss': 24.495, 'grad_norm': 235.15576171875, 'learning_rate': 1.0176043557168784e-05, 'epoch': 13.18} +{'loss': 40.3819, 'grad_norm': 268.524658203125, 'learning_rate': 1.0170598911070781e-05, 'epoch': 13.18} +{'loss': 42.2715, 'grad_norm': 257.869140625, 'learning_rate': 1.0165154264972778e-05, 'epoch': 13.19} +{'loss': 41.2991, 'grad_norm': 191.8995361328125, 'learning_rate': 1.0159709618874774e-05, 'epoch': 13.19} +{'loss': 39.6007, 'grad_norm': 242.85342407226562, 'learning_rate': 1.0154264972776769e-05, 'epoch': 13.2} +{'loss': 39.8502, 'grad_norm': 279.1092529296875, 'learning_rate': 1.0148820326678766e-05, 'epoch': 13.2} +{'loss': 39.6407, 'grad_norm': 233.94708251953125, 'learning_rate': 1.0143375680580763e-05, 'epoch': 13.2} +{'loss': 40.3618, 'grad_norm': 227.53001403808594, 'learning_rate': 1.0137931034482758e-05, 'epoch': 13.21} +{'loss': 41.3187, 'grad_norm': 216.17654418945312, 'learning_rate': 1.0132486388384756e-05, 'epoch': 13.21} +{'loss': 41.7474, 'grad_norm': 199.51072692871094, 'learning_rate': 1.0127041742286751e-05, 'epoch': 13.21} + 66%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3670/5520 [3:12:54<1:31:43, 2.98s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6099065542221069, 'eval_runtime': 3.1426, 'eval_samples_per_second': 56.96, 'eval_steps_per_second': 56.96, 'epoch': 13.21} +{'loss': 40.8565, 'grad_norm': 212.3302001953125, 'learning_rate': 1.0121597096188748e-05, 'epoch': 13.22} +{'loss': 41.5302, 'grad_norm': 185.42857360839844, 'learning_rate': 1.0116152450090745e-05, 'epoch': 13.22} +{'loss': 38.6842, 'grad_norm': 241.05487060546875, 'learning_rate': 1.011070780399274e-05, 'epoch': 13.22} +{'loss': 37.8021, 'grad_norm': 314.1755065917969, 'learning_rate': 1.0105263157894738e-05, 'epoch': 13.23} +{'loss': 36.3265, 'grad_norm': 262.6571960449219, 'learning_rate': 1.0099818511796733e-05, 'epoch': 13.23} +{'loss': 38.4521, 'grad_norm': 259.24029541015625, 'learning_rate': 1.009437386569873e-05, 'epoch': 13.23} +{'loss': 37.3267, 'grad_norm': 223.5182342529297, 'learning_rate': 1.0088929219600727e-05, 'epoch': 13.24} +{'loss': 38.0142, 'grad_norm': 181.72926330566406, 'learning_rate': 1.0083484573502722e-05, 'epoch': 13.24} +{'loss': 37.3513, 'grad_norm': 204.99813842773438, 'learning_rate': 1.0078039927404718e-05, 'epoch': 13.25} +{'loss': 37.9737, 'grad_norm': 184.05482482910156, 'learning_rate': 1.0072595281306715e-05, 'epoch': 13.25} + 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3680/5520 [3:13:26<1:29:39, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6081296801567078, 'eval_runtime': 3.1359, 'eval_samples_per_second': 57.081, 'eval_steps_per_second': 57.081, 'epoch': 13.25} +{'loss': 38.1087, 'grad_norm': 261.076416015625, 'learning_rate': 1.0067150635208712e-05, 'epoch': 13.25} +{'loss': 37.215, 'grad_norm': 218.79515075683594, 'learning_rate': 1.0061705989110709e-05, 'epoch': 13.26} +{'loss': 37.4461, 'grad_norm': 240.93222045898438, 'learning_rate': 1.0056261343012704e-05, 'epoch': 13.26} +{'loss': 39.4396, 'grad_norm': 241.46072387695312, 'learning_rate': 1.00508166969147e-05, 'epoch': 13.26} +{'loss': 38.5512, 'grad_norm': 217.85369873046875, 'learning_rate': 1.0045372050816699e-05, 'epoch': 13.27} +{'loss': 39.4436, 'grad_norm': 254.53549194335938, 'learning_rate': 1.0039927404718694e-05, 'epoch': 13.27} +{'loss': 39.6341, 'grad_norm': 330.2030029296875, 'learning_rate': 1.003448275862069e-05, 'epoch': 13.27} +{'loss': 38.5305, 'grad_norm': 267.6778869628906, 'learning_rate': 1.0029038112522686e-05, 'epoch': 13.28} +{'loss': 39.712, 'grad_norm': 251.23703002929688, 'learning_rate': 1.0023593466424682e-05, 'epoch': 13.28} +{'loss': 37.982, 'grad_norm': 258.8126525878906, 'learning_rate': 1.0018148820326679e-05, 'epoch': 13.29} + 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3690/5520 [3:13:58<1:28:51, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6092600226402283, 'eval_runtime': 3.1494, 'eval_samples_per_second': 56.837, 'eval_steps_per_second': 56.837, 'epoch': 13.29} +{'loss': 35.8938, 'grad_norm': 270.01690673828125, 'learning_rate': 1.0012704174228676e-05, 'epoch': 13.29} +{'loss': 33.2221, 'grad_norm': 271.138671875, 'learning_rate': 1.0007259528130671e-05, 'epoch': 13.29} +{'loss': 32.6252, 'grad_norm': 239.4976806640625, 'learning_rate': 1.0001814882032668e-05, 'epoch': 13.3} +{'loss': 32.3694, 'grad_norm': 203.7470245361328, 'learning_rate': 9.996370235934664e-06, 'epoch': 13.3} +{'loss': 32.7386, 'grad_norm': 255.28419494628906, 'learning_rate': 9.990925589836661e-06, 'epoch': 13.3} +{'loss': 33.7657, 'grad_norm': 267.82489013671875, 'learning_rate': 9.985480943738658e-06, 'epoch': 13.31} +{'loss': 34.085, 'grad_norm': 224.82432556152344, 'learning_rate': 9.980036297640653e-06, 'epoch': 13.31} +{'loss': 33.9186, 'grad_norm': 249.92684936523438, 'learning_rate': 9.974591651542649e-06, 'epoch': 13.31} +{'loss': 35.0909, 'grad_norm': 249.29620361328125, 'learning_rate': 9.969147005444648e-06, 'epoch': 13.32} +{'loss': 35.6823, 'grad_norm': 276.4640808105469, 'learning_rate': 9.963702359346643e-06, 'epoch': 13.32} + 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3700/5520 [3:14:30<1:28:51, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6132593154907227, 'eval_runtime': 3.1401, 'eval_samples_per_second': 57.005, 'eval_steps_per_second': 57.005, 'epoch': 13.32} +{'loss': 35.7071, 'grad_norm': 245.46163940429688, 'learning_rate': 9.958257713248638e-06, 'epoch': 13.33} +{'loss': 33.6089, 'grad_norm': 311.008544921875, 'learning_rate': 9.952813067150635e-06, 'epoch': 13.33} +{'loss': 34.9939, 'grad_norm': 283.2784118652344, 'learning_rate': 9.94736842105263e-06, 'epoch': 13.33} +{'loss': 37.1149, 'grad_norm': 293.2317199707031, 'learning_rate': 9.94192377495463e-06, 'epoch': 13.34} +{'loss': 36.5911, 'grad_norm': 263.33111572265625, 'learning_rate': 9.936479128856625e-06, 'epoch': 13.34} +{'loss': 35.9336, 'grad_norm': 285.1488952636719, 'learning_rate': 9.93103448275862e-06, 'epoch': 13.34} +{'loss': 26.1555, 'grad_norm': 246.30616760253906, 'learning_rate': 9.925589836660617e-06, 'epoch': 13.35} +{'loss': 21.9519, 'grad_norm': 185.4857177734375, 'learning_rate': 9.920145190562614e-06, 'epoch': 13.35} +{'loss': 22.5592, 'grad_norm': 269.6291809082031, 'learning_rate': 9.91470054446461e-06, 'epoch': 13.35} +{'loss': 23.2505, 'grad_norm': 214.7660675048828, 'learning_rate': 9.909255898366607e-06, 'epoch': 13.36} + 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3710/5520 [3:15:02<1:28:30, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6123418211936951, 'eval_runtime': 3.1401, 'eval_samples_per_second': 57.005, 'eval_steps_per_second': 57.005, 'epoch': 13.36} +{'loss': 23.9731, 'grad_norm': 227.8025360107422, 'learning_rate': 9.903811252268602e-06, 'epoch': 13.36} +{'loss': 40.3869, 'grad_norm': 261.7846374511719, 'learning_rate': 9.898366606170598e-06, 'epoch': 13.36} +{'loss': 41.9626, 'grad_norm': 305.4109802246094, 'learning_rate': 9.892921960072596e-06, 'epoch': 13.37} +{'loss': 39.9819, 'grad_norm': 272.86236572265625, 'learning_rate': 9.887477313974592e-06, 'epoch': 13.37} +{'loss': 40.8074, 'grad_norm': 371.4781188964844, 'learning_rate': 9.882032667876589e-06, 'epoch': 13.38} +{'loss': 40.6721, 'grad_norm': 278.7463684082031, 'learning_rate': 9.876588021778584e-06, 'epoch': 13.38} +{'loss': 40.1604, 'grad_norm': 270.41619873046875, 'learning_rate': 9.87114337568058e-06, 'epoch': 13.38} +{'loss': 41.4666, 'grad_norm': 204.42018127441406, 'learning_rate': 9.865698729582578e-06, 'epoch': 13.39} +{'loss': 40.953, 'grad_norm': 197.43289184570312, 'learning_rate': 9.860254083484574e-06, 'epoch': 13.39} +{'loss': 40.6416, 'grad_norm': 203.92056274414062, 'learning_rate': 9.85480943738657e-06, 'epoch': 13.39} + 67%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3720/5520 [3:15:34<1:27:44, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.608938992023468, 'eval_runtime': 3.1479, 'eval_samples_per_second': 56.863, 'eval_steps_per_second': 56.863, 'epoch': 13.39} +{'loss': 39.7, 'grad_norm': 353.2951354980469, 'learning_rate': 9.849364791288566e-06, 'epoch': 13.4} +{'loss': 40.4703, 'grad_norm': 222.94410705566406, 'learning_rate': 9.843920145190563e-06, 'epoch': 13.4} +{'loss': 37.0453, 'grad_norm': 301.0710754394531, 'learning_rate': 9.83847549909256e-06, 'epoch': 13.4} +{'loss': 37.5346, 'grad_norm': 251.70263671875, 'learning_rate': 9.833030852994556e-06, 'epoch': 13.41} +{'loss': 39.0706, 'grad_norm': 201.29335021972656, 'learning_rate': 9.827586206896551e-06, 'epoch': 13.41} +{'loss': 38.4527, 'grad_norm': 233.82212829589844, 'learning_rate': 9.822141560798548e-06, 'epoch': 13.42} +{'loss': 37.82, 'grad_norm': 245.0128936767578, 'learning_rate': 9.816696914700545e-06, 'epoch': 13.42} +{'loss': 38.8858, 'grad_norm': 325.1784973144531, 'learning_rate': 9.81125226860254e-06, 'epoch': 13.42} +{'loss': 37.1919, 'grad_norm': 196.15032958984375, 'learning_rate': 9.805807622504538e-06, 'epoch': 13.43} +{'loss': 39.1644, 'grad_norm': 254.73980712890625, 'learning_rate': 9.800362976406533e-06, 'epoch': 13.43} + 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3730/5520 [3:16:06<1:26:45, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6100116968154907, 'eval_runtime': 3.1394, 'eval_samples_per_second': 57.016, 'eval_steps_per_second': 57.016, 'epoch': 13.43} +{'loss': 39.8542, 'grad_norm': 253.11489868164062, 'learning_rate': 9.79491833030853e-06, 'epoch': 13.43} +{'loss': 39.8469, 'grad_norm': 267.8416748046875, 'learning_rate': 9.789473684210527e-06, 'epoch': 13.44} +{'loss': 37.4556, 'grad_norm': 267.62835693359375, 'learning_rate': 9.784029038112523e-06, 'epoch': 13.44} +{'loss': 39.7817, 'grad_norm': 346.6018371582031, 'learning_rate': 9.77858439201452e-06, 'epoch': 13.44} +{'loss': 39.1631, 'grad_norm': 241.95008850097656, 'learning_rate': 9.773139745916515e-06, 'epoch': 13.45} +{'loss': 38.6152, 'grad_norm': 244.9163055419922, 'learning_rate': 9.767695099818512e-06, 'epoch': 13.45} +{'loss': 39.5388, 'grad_norm': 243.60633850097656, 'learning_rate': 9.76225045372051e-06, 'epoch': 13.46} +{'loss': 40.3007, 'grad_norm': 230.57276916503906, 'learning_rate': 9.756805807622505e-06, 'epoch': 13.46} +{'loss': 37.7111, 'grad_norm': 228.76754760742188, 'learning_rate': 9.7513611615245e-06, 'epoch': 13.46} +{'loss': 38.4114, 'grad_norm': 292.7367248535156, 'learning_rate': 9.745916515426497e-06, 'epoch': 13.47} + 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3740/5520 [3:16:38<1:27:25, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6064842939376831, 'eval_runtime': 3.1485, 'eval_samples_per_second': 56.852, 'eval_steps_per_second': 56.852, 'epoch': 13.47} +{'loss': 34.015, 'grad_norm': 226.9254150390625, 'learning_rate': 9.740471869328494e-06, 'epoch': 13.47} +{'loss': 34.2911, 'grad_norm': 250.38137817382812, 'learning_rate': 9.73502722323049e-06, 'epoch': 13.47} +{'loss': 31.8708, 'grad_norm': 230.447265625, 'learning_rate': 9.729582577132487e-06, 'epoch': 13.48} +{'loss': 34.5685, 'grad_norm': 241.05787658691406, 'learning_rate': 9.724137931034482e-06, 'epoch': 13.48} +{'loss': 32.6084, 'grad_norm': 248.07254028320312, 'learning_rate': 9.718693284936481e-06, 'epoch': 13.48} +{'loss': 32.787, 'grad_norm': 241.22862243652344, 'learning_rate': 9.713248638838476e-06, 'epoch': 13.49} +{'loss': 33.9786, 'grad_norm': 295.4871520996094, 'learning_rate': 9.707803992740472e-06, 'epoch': 13.49} +{'loss': 33.9872, 'grad_norm': 285.3634948730469, 'learning_rate': 9.702359346642469e-06, 'epoch': 13.49} +{'loss': 33.9854, 'grad_norm': 302.39947509765625, 'learning_rate': 9.696914700544464e-06, 'epoch': 13.5} +{'loss': 34.1859, 'grad_norm': 310.0465087890625, 'learning_rate': 9.691470054446461e-06, 'epoch': 13.5} + 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3750/5520 [3:17:10<1:26:07, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6067100167274475, 'eval_runtime': 3.1393, 'eval_samples_per_second': 57.018, 'eval_steps_per_second': 57.018, 'epoch': 13.5} +{'loss': 34.5264, 'grad_norm': 319.9311828613281, 'learning_rate': 9.686025408348458e-06, 'epoch': 13.51} +{'loss': 35.8348, 'grad_norm': 291.75738525390625, 'learning_rate': 9.680580762250454e-06, 'epoch': 13.51} +{'loss': 33.8803, 'grad_norm': 291.5312805175781, 'learning_rate': 9.675136116152449e-06, 'epoch': 13.51} +{'loss': 36.1919, 'grad_norm': 228.00588989257812, 'learning_rate': 9.669691470054448e-06, 'epoch': 13.52} +{'loss': 35.8432, 'grad_norm': 236.5559539794922, 'learning_rate': 9.664246823956443e-06, 'epoch': 13.52} +{'loss': 37.069, 'grad_norm': 287.7408752441406, 'learning_rate': 9.65880217785844e-06, 'epoch': 13.52} +{'loss': 29.1896, 'grad_norm': 272.73870849609375, 'learning_rate': 9.653357531760436e-06, 'epoch': 13.53} +{'loss': 23.0953, 'grad_norm': 256.5550842285156, 'learning_rate': 9.647912885662431e-06, 'epoch': 13.53} +{'loss': 21.9902, 'grad_norm': 230.98487854003906, 'learning_rate': 9.64246823956443e-06, 'epoch': 13.53} +{'loss': 23.7439, 'grad_norm': 247.1185760498047, 'learning_rate': 9.637023593466425e-06, 'epoch': 13.54} + 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3760/5520 [3:17:42<1:25:33, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6106311082839966, 'eval_runtime': 3.1356, 'eval_samples_per_second': 57.086, 'eval_steps_per_second': 57.086, 'epoch': 13.54} +{'loss': 24.2292, 'grad_norm': 193.83152770996094, 'learning_rate': 9.63157894736842e-06, 'epoch': 13.54} +{'loss': 40.9778, 'grad_norm': 322.80487060546875, 'learning_rate': 9.626134301270418e-06, 'epoch': 13.55} +{'loss': 42.3601, 'grad_norm': 345.0560302734375, 'learning_rate': 9.620689655172413e-06, 'epoch': 13.55} +{'loss': 41.092, 'grad_norm': 240.3759002685547, 'learning_rate': 9.61524500907441e-06, 'epoch': 13.55} +{'loss': 40.3108, 'grad_norm': 219.0955352783203, 'learning_rate': 9.609800362976407e-06, 'epoch': 13.56} +{'loss': 39.8885, 'grad_norm': 255.6158447265625, 'learning_rate': 9.604355716878403e-06, 'epoch': 13.56} +{'loss': 40.8838, 'grad_norm': 264.55010986328125, 'learning_rate': 9.5989110707804e-06, 'epoch': 13.56} +{'loss': 40.6634, 'grad_norm': 313.0918273925781, 'learning_rate': 9.593466424682397e-06, 'epoch': 13.57} +{'loss': 41.8734, 'grad_norm': 304.87396240234375, 'learning_rate': 9.588021778584392e-06, 'epoch': 13.57} +{'loss': 40.6281, 'grad_norm': 239.76063537597656, 'learning_rate': 9.58257713248639e-06, 'epoch': 13.57} + 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3770/5520 [3:18:14<1:25:12, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6124129891395569, 'eval_runtime': 3.1388, 'eval_samples_per_second': 57.028, 'eval_steps_per_second': 57.028, 'epoch': 13.57} +{'loss': 39.6948, 'grad_norm': 201.89422607421875, 'learning_rate': 9.577132486388385e-06, 'epoch': 13.58} +{'loss': 39.6927, 'grad_norm': 232.8797607421875, 'learning_rate': 9.57168784029038e-06, 'epoch': 13.58} +{'loss': 37.6926, 'grad_norm': 250.30355834960938, 'learning_rate': 9.566243194192379e-06, 'epoch': 13.59} +{'loss': 38.248, 'grad_norm': 256.23626708984375, 'learning_rate': 9.560798548094374e-06, 'epoch': 13.59} +{'loss': 36.8178, 'grad_norm': 234.1791534423828, 'learning_rate': 9.555353901996371e-06, 'epoch': 13.59} +{'loss': 37.0802, 'grad_norm': 243.87615966796875, 'learning_rate': 9.549909255898367e-06, 'epoch': 13.6} +{'loss': 37.1251, 'grad_norm': 220.98150634765625, 'learning_rate': 9.544464609800362e-06, 'epoch': 13.6} +{'loss': 38.2965, 'grad_norm': 235.8653564453125, 'learning_rate': 9.53901996370236e-06, 'epoch': 13.6} +{'loss': 38.0266, 'grad_norm': 237.66712951660156, 'learning_rate': 9.533575317604356e-06, 'epoch': 13.61} +{'loss': 38.4199, 'grad_norm': 229.4922637939453, 'learning_rate': 9.528130671506351e-06, 'epoch': 13.61} + 68%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3780/5520 [3:18:46<1:24:47, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6078812479972839, 'eval_runtime': 3.1383, 'eval_samples_per_second': 57.037, 'eval_steps_per_second': 57.037, 'epoch': 13.61} +{'loss': 39.713, 'grad_norm': 250.82533264160156, 'learning_rate': 9.522686025408349e-06, 'epoch': 13.61} +{'loss': 37.6396, 'grad_norm': 218.97511291503906, 'learning_rate': 9.517241379310346e-06, 'epoch': 13.62} +{'loss': 39.2808, 'grad_norm': 240.13096618652344, 'learning_rate': 9.511796733212341e-06, 'epoch': 13.62} +{'loss': 39.1584, 'grad_norm': 214.77957153320312, 'learning_rate': 9.506352087114338e-06, 'epoch': 13.62} +{'loss': 39.6725, 'grad_norm': 273.2488708496094, 'learning_rate': 9.500907441016333e-06, 'epoch': 13.63} +{'loss': 40.155, 'grad_norm': 240.46669006347656, 'learning_rate': 9.49546279491833e-06, 'epoch': 13.63} +{'loss': 39.5831, 'grad_norm': 304.46533203125, 'learning_rate': 9.490018148820328e-06, 'epoch': 13.64} +{'loss': 40.8392, 'grad_norm': 282.9252624511719, 'learning_rate': 9.484573502722323e-06, 'epoch': 13.64} +{'loss': 38.4015, 'grad_norm': 229.2595977783203, 'learning_rate': 9.47912885662432e-06, 'epoch': 13.64} +{'loss': 35.0578, 'grad_norm': 300.0253601074219, 'learning_rate': 9.473684210526315e-06, 'epoch': 13.65} + 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3790/5520 [3:19:18<1:24:32, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6059401631355286, 'eval_runtime': 3.1422, 'eval_samples_per_second': 56.967, 'eval_steps_per_second': 56.967, 'epoch': 13.65} +{'loss': 33.0308, 'grad_norm': 266.379638671875, 'learning_rate': 9.468239564428313e-06, 'epoch': 13.65} +{'loss': 31.7632, 'grad_norm': 248.8190460205078, 'learning_rate': 9.46279491833031e-06, 'epoch': 13.65} +{'loss': 32.8875, 'grad_norm': 224.4126739501953, 'learning_rate': 9.457350272232305e-06, 'epoch': 13.66} +{'loss': 32.3248, 'grad_norm': 259.84466552734375, 'learning_rate': 9.4519056261343e-06, 'epoch': 13.66} +{'loss': 32.5855, 'grad_norm': 233.59483337402344, 'learning_rate': 9.446460980036297e-06, 'epoch': 13.66} +{'loss': 33.8277, 'grad_norm': 283.1840515136719, 'learning_rate': 9.441016333938295e-06, 'epoch': 13.67} +{'loss': 33.8348, 'grad_norm': 269.51171875, 'learning_rate': 9.435571687840292e-06, 'epoch': 13.67} +{'loss': 34.2571, 'grad_norm': 284.6701354980469, 'learning_rate': 9.430127041742287e-06, 'epoch': 13.68} +{'loss': 34.2313, 'grad_norm': 308.96221923828125, 'learning_rate': 9.424682395644282e-06, 'epoch': 13.68} +{'loss': 34.6341, 'grad_norm': 229.36366271972656, 'learning_rate': 9.41923774954628e-06, 'epoch': 13.68} + 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3800/5520 [3:19:50<1:24:10, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.606715202331543, 'eval_runtime': 3.1366, 'eval_samples_per_second': 57.068, 'eval_steps_per_second': 57.068, 'epoch': 13.68} +{'loss': 35.2222, 'grad_norm': 335.4346008300781, 'learning_rate': 9.413793103448277e-06, 'epoch': 13.69} +{'loss': 34.7416, 'grad_norm': 259.72222900390625, 'learning_rate': 9.408348457350272e-06, 'epoch': 13.69} +{'loss': 34.2018, 'grad_norm': 275.96112060546875, 'learning_rate': 9.402903811252269e-06, 'epoch': 13.69} +{'loss': 37.8801, 'grad_norm': 349.28924560546875, 'learning_rate': 9.397459165154264e-06, 'epoch': 13.7} +{'loss': 37.5101, 'grad_norm': 288.47540283203125, 'learning_rate': 9.392014519056261e-06, 'epoch': 13.7} +{'loss': 36.9294, 'grad_norm': 255.31033325195312, 'learning_rate': 9.386569872958259e-06, 'epoch': 13.7} +{'loss': 31.64, 'grad_norm': 273.757080078125, 'learning_rate': 9.381125226860254e-06, 'epoch': 13.71} +{'loss': 22.9812, 'grad_norm': 236.24928283691406, 'learning_rate': 9.375680580762251e-06, 'epoch': 13.71} +{'loss': 22.4788, 'grad_norm': 206.70883178710938, 'learning_rate': 9.370235934664246e-06, 'epoch': 13.72} +{'loss': 23.3803, 'grad_norm': 168.15762329101562, 'learning_rate': 9.364791288566243e-06, 'epoch': 13.72} + 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3800/5520 [3:19:53<1:24:10, 2.94s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3810/5520 [3:20:23<1:24:06, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6092759966850281, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.962, 'eval_steps_per_second': 56.962, 'epoch': 13.72} +{'loss': 24.8757, 'grad_norm': 261.88397216796875, 'learning_rate': 9.35934664246824e-06, 'epoch': 13.72} +{'loss': 39.8777, 'grad_norm': 235.3518829345703, 'learning_rate': 9.353901996370236e-06, 'epoch': 13.73} +{'loss': 40.4357, 'grad_norm': 226.94027709960938, 'learning_rate': 9.348457350272231e-06, 'epoch': 13.73} +{'loss': 41.6411, 'grad_norm': 266.2643737792969, 'learning_rate': 9.34301270417423e-06, 'epoch': 13.73} +{'loss': 39.862, 'grad_norm': 327.39288330078125, 'learning_rate': 9.337568058076225e-06, 'epoch': 13.74} +{'loss': 39.1833, 'grad_norm': 241.03121948242188, 'learning_rate': 9.332123411978223e-06, 'epoch': 13.74} +{'loss': 40.6895, 'grad_norm': 232.2872314453125, 'learning_rate': 9.326678765880218e-06, 'epoch': 13.74} +{'loss': 39.5891, 'grad_norm': 236.909912109375, 'learning_rate': 9.321234119782213e-06, 'epoch': 13.75} +{'loss': 41.5211, 'grad_norm': 193.81478881835938, 'learning_rate': 9.315789473684212e-06, 'epoch': 13.75} +{'loss': 41.0726, 'grad_norm': 214.87301635742188, 'learning_rate': 9.310344827586207e-06, 'epoch': 13.75} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3820/5520 [3:20:55<1:23:00, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6098713874816895, 'eval_runtime': 3.1378, 'eval_samples_per_second': 57.047, 'eval_steps_per_second': 57.047, 'epoch': 13.75} +{'loss': 40.1843, 'grad_norm': 196.57247924804688, 'learning_rate': 9.304900181488203e-06, 'epoch': 13.76} +{'loss': 37.6279, 'grad_norm': 215.59698486328125, 'learning_rate': 9.2994555353902e-06, 'epoch': 13.76} +{'loss': 37.9593, 'grad_norm': 221.1280059814453, 'learning_rate': 9.294010889292195e-06, 'epoch': 13.77} +{'loss': 37.3399, 'grad_norm': 314.94610595703125, 'learning_rate': 9.288566243194192e-06, 'epoch': 13.77} +{'loss': 38.3185, 'grad_norm': 240.10816955566406, 'learning_rate': 9.28312159709619e-06, 'epoch': 13.77} +{'loss': 36.9407, 'grad_norm': 229.2427978515625, 'learning_rate': 9.277676950998185e-06, 'epoch': 13.78} +{'loss': 39.3709, 'grad_norm': 224.78335571289062, 'learning_rate': 9.272232304900182e-06, 'epoch': 13.78} +{'loss': 38.2303, 'grad_norm': 216.5969696044922, 'learning_rate': 9.266787658802179e-06, 'epoch': 13.78} +{'loss': 39.492, 'grad_norm': 208.7849884033203, 'learning_rate': 9.261343012704174e-06, 'epoch': 13.79} +{'loss': 38.5599, 'grad_norm': 215.76475524902344, 'learning_rate': 9.255898366606171e-06, 'epoch': 13.79} + 69%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3830/5520 [3:21:28<1:22:29, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6080366969108582, 'eval_runtime': 3.1381, 'eval_samples_per_second': 57.041, 'eval_steps_per_second': 57.041, 'epoch': 13.79} +{'loss': 39.315, 'grad_norm': 224.64462280273438, 'learning_rate': 9.250453720508167e-06, 'epoch': 13.79} +{'loss': 38.3108, 'grad_norm': 298.545654296875, 'learning_rate': 9.245009074410162e-06, 'epoch': 13.8} +{'loss': 39.9223, 'grad_norm': 236.5186767578125, 'learning_rate': 9.239564428312161e-06, 'epoch': 13.8} +{'loss': 39.4288, 'grad_norm': 251.47999572753906, 'learning_rate': 9.234119782214156e-06, 'epoch': 13.81} +{'loss': 38.276, 'grad_norm': 260.8268737792969, 'learning_rate': 9.228675136116152e-06, 'epoch': 13.81} +{'loss': 40.7118, 'grad_norm': 253.25172424316406, 'learning_rate': 9.223230490018149e-06, 'epoch': 13.81} +{'loss': 40.1916, 'grad_norm': 250.31784057617188, 'learning_rate': 9.217785843920146e-06, 'epoch': 13.82} +{'loss': 38.1513, 'grad_norm': 228.79234313964844, 'learning_rate': 9.212341197822143e-06, 'epoch': 13.82} +{'loss': 38.43, 'grad_norm': 262.689697265625, 'learning_rate': 9.206896551724138e-06, 'epoch': 13.82} +{'loss': 34.2476, 'grad_norm': 191.04139709472656, 'learning_rate': 9.201451905626134e-06, 'epoch': 13.83} + 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3840/5520 [3:22:00<1:22:09, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6077054142951965, 'eval_runtime': 3.1445, 'eval_samples_per_second': 56.925, 'eval_steps_per_second': 56.925, 'epoch': 13.83} +{'loss': 33.7892, 'grad_norm': 236.3266143798828, 'learning_rate': 9.196007259528131e-06, 'epoch': 13.83} +{'loss': 31.9857, 'grad_norm': 284.8748474121094, 'learning_rate': 9.190562613430128e-06, 'epoch': 13.83} +{'loss': 32.8165, 'grad_norm': 261.17413330078125, 'learning_rate': 9.185117967332123e-06, 'epoch': 13.84} +{'loss': 33.1709, 'grad_norm': 195.1323699951172, 'learning_rate': 9.17967332123412e-06, 'epoch': 13.84} +{'loss': 33.149, 'grad_norm': 220.5006561279297, 'learning_rate': 9.174228675136116e-06, 'epoch': 13.85} +{'loss': 33.633, 'grad_norm': 236.7254638671875, 'learning_rate': 9.168784029038111e-06, 'epoch': 13.85} +{'loss': 34.6822, 'grad_norm': 269.1921691894531, 'learning_rate': 9.16333938294011e-06, 'epoch': 13.85} +{'loss': 35.2816, 'grad_norm': 222.4369354248047, 'learning_rate': 9.157894736842105e-06, 'epoch': 13.86} +{'loss': 35.0067, 'grad_norm': 232.4306640625, 'learning_rate': 9.152450090744102e-06, 'epoch': 13.86} +{'loss': 34.264, 'grad_norm': 297.0786437988281, 'learning_rate': 9.147005444646098e-06, 'epoch': 13.86} + 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3850/5520 [3:22:32<1:21:46, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6047748327255249, 'eval_runtime': 3.1408, 'eval_samples_per_second': 56.992, 'eval_steps_per_second': 56.992, 'epoch': 13.86} +{'loss': 35.4996, 'grad_norm': 370.232421875, 'learning_rate': 9.141560798548095e-06, 'epoch': 13.87} +{'loss': 36.1403, 'grad_norm': 216.05775451660156, 'learning_rate': 9.136116152450092e-06, 'epoch': 13.87} +{'loss': 36.0324, 'grad_norm': 233.11138916015625, 'learning_rate': 9.130671506352087e-06, 'epoch': 13.87} +{'loss': 36.5617, 'grad_norm': 297.1761779785156, 'learning_rate': 9.125226860254083e-06, 'epoch': 13.88} +{'loss': 36.7113, 'grad_norm': 290.61590576171875, 'learning_rate': 9.11978221415608e-06, 'epoch': 13.88} +{'loss': 36.9964, 'grad_norm': 293.5744934082031, 'learning_rate': 9.114337568058077e-06, 'epoch': 13.88} +{'loss': 31.8552, 'grad_norm': 227.73455810546875, 'learning_rate': 9.108892921960072e-06, 'epoch': 13.89} +{'loss': 22.9122, 'grad_norm': 223.36077880859375, 'learning_rate': 9.10344827586207e-06, 'epoch': 13.89} +{'loss': 22.366, 'grad_norm': 181.14501953125, 'learning_rate': 9.098003629764065e-06, 'epoch': 13.9} +{'loss': 23.9545, 'grad_norm': 215.75856018066406, 'learning_rate': 9.092558983666063e-06, 'epoch': 13.9} + 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3860/5520 [3:23:04<1:20:53, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6072003245353699, 'eval_runtime': 3.1394, 'eval_samples_per_second': 57.018, 'eval_steps_per_second': 57.018, 'epoch': 13.9} +{'loss': 23.5196, 'grad_norm': 233.22837829589844, 'learning_rate': 9.087114337568059e-06, 'epoch': 13.9} +{'loss': 41.4605, 'grad_norm': 269.9342041015625, 'learning_rate': 9.081669691470054e-06, 'epoch': 13.91} +{'loss': 40.2848, 'grad_norm': 304.4266662597656, 'learning_rate': 9.076225045372051e-06, 'epoch': 13.91} +{'loss': 41.0044, 'grad_norm': 318.2371520996094, 'learning_rate': 9.070780399274047e-06, 'epoch': 13.91} +{'loss': 40.776, 'grad_norm': 272.9725341796875, 'learning_rate': 9.065335753176044e-06, 'epoch': 13.92} +{'loss': 39.4964, 'grad_norm': 213.8822784423828, 'learning_rate': 9.059891107078041e-06, 'epoch': 13.92} +{'loss': 41.3482, 'grad_norm': 239.16128540039062, 'learning_rate': 9.054446460980036e-06, 'epoch': 13.92} +{'loss': 38.2433, 'grad_norm': 264.839111328125, 'learning_rate': 9.049001814882033e-06, 'epoch': 13.93} +{'loss': 38.6482, 'grad_norm': 244.00926208496094, 'learning_rate': 9.043557168784029e-06, 'epoch': 13.93} +{'loss': 39.2047, 'grad_norm': 342.8050537109375, 'learning_rate': 9.038112522686026e-06, 'epoch': 13.94} + 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3870/5520 [3:23:36<1:20:32, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6078094244003296, 'eval_runtime': 3.1426, 'eval_samples_per_second': 56.96, 'eval_steps_per_second': 56.96, 'epoch': 13.94} +{'loss': 39.2827, 'grad_norm': 232.509033203125, 'learning_rate': 9.032667876588023e-06, 'epoch': 13.94} +{'loss': 38.2709, 'grad_norm': 343.2891845703125, 'learning_rate': 9.027223230490018e-06, 'epoch': 13.94} +{'loss': 38.8266, 'grad_norm': 332.9613342285156, 'learning_rate': 9.021778584392014e-06, 'epoch': 13.95} +{'loss': 39.9249, 'grad_norm': 339.5653076171875, 'learning_rate': 9.016333938294012e-06, 'epoch': 13.95} +{'loss': 39.4593, 'grad_norm': 269.0108947753906, 'learning_rate': 9.010889292196008e-06, 'epoch': 13.95} +{'loss': 39.5471, 'grad_norm': 252.5339813232422, 'learning_rate': 9.005444646098003e-06, 'epoch': 13.96} +{'loss': 35.7505, 'grad_norm': 424.7225646972656, 'learning_rate': 9e-06, 'epoch': 13.96} +{'loss': 32.445, 'grad_norm': 286.189208984375, 'learning_rate': 8.994555353901996e-06, 'epoch': 13.96} +{'loss': 33.2369, 'grad_norm': 245.153564453125, 'learning_rate': 8.989110707803994e-06, 'epoch': 13.97} +{'loss': 31.7864, 'grad_norm': 305.3119812011719, 'learning_rate': 8.98366606170599e-06, 'epoch': 13.97} + 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3880/5520 [3:24:07<1:18:29, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6069231629371643, 'eval_runtime': 3.1471, 'eval_samples_per_second': 56.877, 'eval_steps_per_second': 56.877, 'epoch': 13.97} +{'loss': 33.7166, 'grad_norm': 218.70913696289062, 'learning_rate': 8.978221415607985e-06, 'epoch': 13.98} +{'loss': 35.8878, 'grad_norm': 334.856201171875, 'learning_rate': 8.972776769509982e-06, 'epoch': 13.98} +{'loss': 35.1525, 'grad_norm': 305.65203857421875, 'learning_rate': 8.96733212341198e-06, 'epoch': 13.98} +{'loss': 34.8268, 'grad_norm': 330.148193359375, 'learning_rate': 8.961887477313975e-06, 'epoch': 13.99} +{'loss': 35.5068, 'grad_norm': 288.9424133300781, 'learning_rate': 8.956442831215972e-06, 'epoch': 13.99} +{'loss': 28.5016, 'grad_norm': 256.2596740722656, 'learning_rate': 8.950998185117967e-06, 'epoch': 13.99} +{'loss': 23.7416, 'grad_norm': 234.31991577148438, 'learning_rate': 8.945553539019963e-06, 'epoch': 14.0} +{'loss': 21.0329, 'grad_norm': 182.19000244140625, 'learning_rate': 8.940108892921961e-06, 'epoch': 14.0} +{'loss': 39.94, 'grad_norm': 254.86355590820312, 'learning_rate': 8.934664246823957e-06, 'epoch': 14.0} +{'loss': 40.3213, 'grad_norm': 229.75650024414062, 'learning_rate': 8.929219600725954e-06, 'epoch': 14.01} + 70%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3890/5520 [3:24:39<1:19:22, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.604503870010376, 'eval_runtime': 3.1426, 'eval_samples_per_second': 56.96, 'eval_steps_per_second': 56.96, 'epoch': 14.01} +{'loss': 40.1568, 'grad_norm': 220.18190002441406, 'learning_rate': 8.923774954627949e-06, 'epoch': 14.01} +{'loss': 40.3685, 'grad_norm': 269.5978088378906, 'learning_rate': 8.918330308529945e-06, 'epoch': 14.01} +{'loss': 40.0845, 'grad_norm': 254.3507537841797, 'learning_rate': 8.912885662431943e-06, 'epoch': 14.02} +{'loss': 40.1731, 'grad_norm': 251.43653869628906, 'learning_rate': 8.907441016333939e-06, 'epoch': 14.02} +{'loss': 39.7179, 'grad_norm': 215.91253662109375, 'learning_rate': 8.901996370235934e-06, 'epoch': 14.03} +{'loss': 41.0822, 'grad_norm': 247.81790161132812, 'learning_rate': 8.896551724137931e-06, 'epoch': 14.03} +{'loss': 39.7873, 'grad_norm': 232.45892333984375, 'learning_rate': 8.891107078039928e-06, 'epoch': 14.03} +{'loss': 41.1302, 'grad_norm': 231.8137969970703, 'learning_rate': 8.885662431941924e-06, 'epoch': 14.04} +{'loss': 39.2293, 'grad_norm': 219.09446716308594, 'learning_rate': 8.88021778584392e-06, 'epoch': 14.04} +{'loss': 37.3338, 'grad_norm': 187.99874877929688, 'learning_rate': 8.874773139745916e-06, 'epoch': 14.04} + 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3900/5520 [3:25:11<1:18:51, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.603966236114502, 'eval_runtime': 3.1361, 'eval_samples_per_second': 57.077, 'eval_steps_per_second': 57.077, 'epoch': 14.04} +{'loss': 36.9479, 'grad_norm': 285.2400207519531, 'learning_rate': 8.869328493647913e-06, 'epoch': 14.05} +{'loss': 35.1313, 'grad_norm': 234.23655700683594, 'learning_rate': 8.86388384754991e-06, 'epoch': 14.05} +{'loss': 36.5917, 'grad_norm': 234.78717041015625, 'learning_rate': 8.858439201451906e-06, 'epoch': 14.05} +{'loss': 38.3228, 'grad_norm': 226.53997802734375, 'learning_rate': 8.852994555353903e-06, 'epoch': 14.06} +{'loss': 37.3542, 'grad_norm': 222.05213928222656, 'learning_rate': 8.847549909255898e-06, 'epoch': 14.06} +{'loss': 37.6396, 'grad_norm': 222.9646759033203, 'learning_rate': 8.842105263157893e-06, 'epoch': 14.07} +{'loss': 38.1988, 'grad_norm': 227.78965759277344, 'learning_rate': 8.836660617059892e-06, 'epoch': 14.07} +{'loss': 38.3981, 'grad_norm': 200.89691162109375, 'learning_rate': 8.831215970961888e-06, 'epoch': 14.07} +{'loss': 37.3422, 'grad_norm': 212.52891540527344, 'learning_rate': 8.825771324863883e-06, 'epoch': 14.08} +{'loss': 38.1292, 'grad_norm': 312.33905029296875, 'learning_rate': 8.82032667876588e-06, 'epoch': 14.08} + 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3910/5520 [3:25:43<1:18:07, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6061921119689941, 'eval_runtime': 3.1394, 'eval_samples_per_second': 57.017, 'eval_steps_per_second': 57.017, 'epoch': 14.08} +{'loss': 37.5543, 'grad_norm': 261.8415832519531, 'learning_rate': 8.814882032667877e-06, 'epoch': 14.08} +{'loss': 39.3912, 'grad_norm': 264.625732421875, 'learning_rate': 8.809437386569874e-06, 'epoch': 14.09} +{'loss': 39.7879, 'grad_norm': 305.7203063964844, 'learning_rate': 8.80399274047187e-06, 'epoch': 14.09} +{'loss': 38.7212, 'grad_norm': 282.63616943359375, 'learning_rate': 8.798548094373865e-06, 'epoch': 14.09} +{'loss': 40.6198, 'grad_norm': 246.49169921875, 'learning_rate': 8.793103448275862e-06, 'epoch': 14.1} +{'loss': 39.6947, 'grad_norm': 283.2737731933594, 'learning_rate': 8.787658802177859e-06, 'epoch': 14.1} +{'loss': 38.6157, 'grad_norm': 306.95721435546875, 'learning_rate': 8.782214156079855e-06, 'epoch': 14.1} +{'loss': 35.5328, 'grad_norm': 238.1789093017578, 'learning_rate': 8.776769509981852e-06, 'epoch': 14.11} +{'loss': 32.4008, 'grad_norm': 233.2298126220703, 'learning_rate': 8.771324863883847e-06, 'epoch': 14.11} +{'loss': 31.0712, 'grad_norm': 233.46339416503906, 'learning_rate': 8.765880217785846e-06, 'epoch': 14.12} + 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3920/5520 [3:26:15<1:18:04, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6046931147575378, 'eval_runtime': 3.1417, 'eval_samples_per_second': 56.976, 'eval_steps_per_second': 56.976, 'epoch': 14.12} +{'loss': 33.252, 'grad_norm': 226.30343627929688, 'learning_rate': 8.760435571687841e-06, 'epoch': 14.12} +{'loss': 31.526, 'grad_norm': 247.17465209960938, 'learning_rate': 8.754990925589837e-06, 'epoch': 14.12} +{'loss': 32.4838, 'grad_norm': 208.25439453125, 'learning_rate': 8.749546279491834e-06, 'epoch': 14.13} +{'loss': 32.7987, 'grad_norm': 236.4488525390625, 'learning_rate': 8.744101633393829e-06, 'epoch': 14.13} +{'loss': 32.8516, 'grad_norm': 219.13279724121094, 'learning_rate': 8.738656987295826e-06, 'epoch': 14.13} +{'loss': 33.7763, 'grad_norm': 239.7289581298828, 'learning_rate': 8.733212341197823e-06, 'epoch': 14.14} +{'loss': 35.675, 'grad_norm': 226.3568878173828, 'learning_rate': 8.727767695099819e-06, 'epoch': 14.14} +{'loss': 34.0523, 'grad_norm': 302.84307861328125, 'learning_rate': 8.722323049001814e-06, 'epoch': 14.14} +{'loss': 35.2923, 'grad_norm': 280.40106201171875, 'learning_rate': 8.716878402903811e-06, 'epoch': 14.15} +{'loss': 36.0242, 'grad_norm': 238.30520629882812, 'learning_rate': 8.711433756805808e-06, 'epoch': 14.15} + 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3930/5520 [3:26:47<1:17:50, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6067762970924377, 'eval_runtime': 3.142, 'eval_samples_per_second': 56.969, 'eval_steps_per_second': 56.969, 'epoch': 14.15} +{'loss': 36.2959, 'grad_norm': 238.6465301513672, 'learning_rate': 8.705989110707805e-06, 'epoch': 14.16} +{'loss': 35.45, 'grad_norm': 273.26837158203125, 'learning_rate': 8.7005444646098e-06, 'epoch': 14.16} +{'loss': 36.4428, 'grad_norm': 296.907958984375, 'learning_rate': 8.695099818511796e-06, 'epoch': 14.16} +{'loss': 26.4171, 'grad_norm': 215.07374572753906, 'learning_rate': 8.689655172413795e-06, 'epoch': 14.17} +{'loss': 22.5483, 'grad_norm': 217.64779663085938, 'learning_rate': 8.68421052631579e-06, 'epoch': 14.17} +{'loss': 22.0396, 'grad_norm': 243.59364318847656, 'learning_rate': 8.678765880217785e-06, 'epoch': 14.17} +{'loss': 23.0957, 'grad_norm': 189.66969299316406, 'learning_rate': 8.673321234119783e-06, 'epoch': 14.18} +{'loss': 23.9385, 'grad_norm': 191.86180114746094, 'learning_rate': 8.667876588021778e-06, 'epoch': 14.18} +{'loss': 40.1665, 'grad_norm': 234.34896850585938, 'learning_rate': 8.662431941923775e-06, 'epoch': 14.18} +{'loss': 40.6752, 'grad_norm': 230.52401733398438, 'learning_rate': 8.656987295825772e-06, 'epoch': 14.19} + 71%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3940/5520 [3:27:19<1:17:23, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6088615655899048, 'eval_runtime': 3.1405, 'eval_samples_per_second': 56.998, 'eval_steps_per_second': 56.998, 'epoch': 14.19} +{'loss': 40.7938, 'grad_norm': 234.06272888183594, 'learning_rate': 8.651542649727767e-06, 'epoch': 14.19} +{'loss': 38.7342, 'grad_norm': 344.4232482910156, 'learning_rate': 8.646098003629765e-06, 'epoch': 14.2} +{'loss': 40.2052, 'grad_norm': 375.74365234375, 'learning_rate': 8.640653357531762e-06, 'epoch': 14.2} +{'loss': 39.7266, 'grad_norm': 258.15570068359375, 'learning_rate': 8.635208711433757e-06, 'epoch': 14.2} +{'loss': 40.4821, 'grad_norm': 235.2681121826172, 'learning_rate': 8.629764065335754e-06, 'epoch': 14.21} +{'loss': 41.2414, 'grad_norm': 226.94764709472656, 'learning_rate': 8.62431941923775e-06, 'epoch': 14.21} +{'loss': 40.5807, 'grad_norm': 236.22109985351562, 'learning_rate': 8.618874773139745e-06, 'epoch': 14.21} +{'loss': 40.4824, 'grad_norm': 201.31112670898438, 'learning_rate': 8.613430127041744e-06, 'epoch': 14.22} +{'loss': 38.3881, 'grad_norm': 328.0167541503906, 'learning_rate': 8.607985480943739e-06, 'epoch': 14.22} +{'loss': 36.5777, 'grad_norm': 281.4416809082031, 'learning_rate': 8.602540834845734e-06, 'epoch': 14.22} + 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3950/5520 [3:27:52<1:16:40, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6099084615707397, 'eval_runtime': 3.1377, 'eval_samples_per_second': 57.049, 'eval_steps_per_second': 57.049, 'epoch': 14.22} +{'loss': 37.5071, 'grad_norm': 258.5203552246094, 'learning_rate': 8.597096188747731e-06, 'epoch': 14.23} +{'loss': 36.358, 'grad_norm': 274.8222351074219, 'learning_rate': 8.591651542649727e-06, 'epoch': 14.23} +{'loss': 37.5859, 'grad_norm': 253.1671600341797, 'learning_rate': 8.586206896551726e-06, 'epoch': 14.23} +{'loss': 37.8799, 'grad_norm': 249.80943298339844, 'learning_rate': 8.580762250453721e-06, 'epoch': 14.24} +{'loss': 36.7551, 'grad_norm': 245.29103088378906, 'learning_rate': 8.575317604355716e-06, 'epoch': 14.24} +{'loss': 38.4761, 'grad_norm': 205.5915985107422, 'learning_rate': 8.569872958257713e-06, 'epoch': 14.25} +{'loss': 37.5862, 'grad_norm': 218.10328674316406, 'learning_rate': 8.56442831215971e-06, 'epoch': 14.25} +{'loss': 39.2851, 'grad_norm': 273.5924072265625, 'learning_rate': 8.558983666061706e-06, 'epoch': 14.25} +{'loss': 39.0707, 'grad_norm': 235.48069763183594, 'learning_rate': 8.553539019963703e-06, 'epoch': 14.26} +{'loss': 37.8469, 'grad_norm': 230.93150329589844, 'learning_rate': 8.548094373865698e-06, 'epoch': 14.26} + 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3960/5520 [3:28:24<1:15:40, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6072147488594055, 'eval_runtime': 3.1414, 'eval_samples_per_second': 56.982, 'eval_steps_per_second': 56.982, 'epoch': 14.26} +{'loss': 39.4245, 'grad_norm': 226.3638458251953, 'learning_rate': 8.542649727767695e-06, 'epoch': 14.26} +{'loss': 38.116, 'grad_norm': 226.74595642089844, 'learning_rate': 8.537205081669693e-06, 'epoch': 14.27} +{'loss': 39.9114, 'grad_norm': 226.1452178955078, 'learning_rate': 8.531760435571688e-06, 'epoch': 14.27} +{'loss': 38.9457, 'grad_norm': 387.8020324707031, 'learning_rate': 8.526315789473685e-06, 'epoch': 14.27} +{'loss': 40.7989, 'grad_norm': 381.5679931640625, 'learning_rate': 8.52087114337568e-06, 'epoch': 14.28} +{'loss': 37.6288, 'grad_norm': 246.16464233398438, 'learning_rate': 8.515426497277677e-06, 'epoch': 14.28} +{'loss': 37.3276, 'grad_norm': 337.05059814453125, 'learning_rate': 8.509981851179674e-06, 'epoch': 14.29} +{'loss': 33.9465, 'grad_norm': 223.80421447753906, 'learning_rate': 8.50453720508167e-06, 'epoch': 14.29} +{'loss': 33.0305, 'grad_norm': 218.9332275390625, 'learning_rate': 8.499092558983665e-06, 'epoch': 14.29} +{'loss': 31.3806, 'grad_norm': 254.20726013183594, 'learning_rate': 8.493647912885662e-06, 'epoch': 14.3} + 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 3970/5520 [3:28:56<1:15:01, 2.90s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6070483922958374, 'eval_runtime': 3.1412, 'eval_samples_per_second': 56.984, 'eval_steps_per_second': 56.984, 'epoch': 14.3} +{'loss': 31.7001, 'grad_norm': 232.96702575683594, 'learning_rate': 8.48820326678766e-06, 'epoch': 14.3} +{'loss': 32.2629, 'grad_norm': 305.31207275390625, 'learning_rate': 8.482758620689656e-06, 'epoch': 14.3} +{'loss': 34.2635, 'grad_norm': 253.60858154296875, 'learning_rate': 8.477313974591652e-06, 'epoch': 14.31} +{'loss': 34.6987, 'grad_norm': 395.4168701171875, 'learning_rate': 8.471869328493647e-06, 'epoch': 14.31} +{'loss': 34.5488, 'grad_norm': 279.72845458984375, 'learning_rate': 8.466424682395644e-06, 'epoch': 14.31} +{'loss': 35.2566, 'grad_norm': 285.7306213378906, 'learning_rate': 8.460980036297641e-06, 'epoch': 14.32} +{'loss': 34.5273, 'grad_norm': 229.04226684570312, 'learning_rate': 8.455535390199637e-06, 'epoch': 14.32} +{'loss': 34.6337, 'grad_norm': 232.50205993652344, 'learning_rate': 8.450090744101634e-06, 'epoch': 14.33} +{'loss': 35.1575, 'grad_norm': 225.87583923339844, 'learning_rate': 8.44464609800363e-06, 'epoch': 14.33} +{'loss': 34.2619, 'grad_norm': 266.2709045410156, 'learning_rate': 8.439201451905626e-06, 'epoch': 14.33} + 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 3980/5520 [3:29:28<1:15:54, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6066078543663025, 'eval_runtime': 3.149, 'eval_samples_per_second': 56.843, 'eval_steps_per_second': 56.843, 'epoch': 14.33} +{'loss': 35.5713, 'grad_norm': 283.557373046875, 'learning_rate': 8.433756805807623e-06, 'epoch': 14.34} +{'loss': 36.7442, 'grad_norm': 288.43707275390625, 'learning_rate': 8.428312159709619e-06, 'epoch': 14.34} +{'loss': 35.5839, 'grad_norm': 331.3218994140625, 'learning_rate': 8.422867513611616e-06, 'epoch': 14.34} +{'loss': 30.2221, 'grad_norm': 257.1488037109375, 'learning_rate': 8.417422867513611e-06, 'epoch': 14.35} +{'loss': 22.217, 'grad_norm': 200.0919189453125, 'learning_rate': 8.411978221415608e-06, 'epoch': 14.35} +{'loss': 22.8927, 'grad_norm': 245.030029296875, 'learning_rate': 8.406533575317605e-06, 'epoch': 14.35} +{'loss': 22.9537, 'grad_norm': 208.5701904296875, 'learning_rate': 8.4010889292196e-06, 'epoch': 14.36} +{'loss': 24.5304, 'grad_norm': 232.0613250732422, 'learning_rate': 8.395644283121596e-06, 'epoch': 14.36} +{'loss': 39.4552, 'grad_norm': 193.56541442871094, 'learning_rate': 8.390199637023595e-06, 'epoch': 14.36} +{'loss': 41.0417, 'grad_norm': 230.35507202148438, 'learning_rate': 8.38475499092559e-06, 'epoch': 14.37} + 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 3990/5520 [3:30:00<1:14:56, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6071842908859253, 'eval_runtime': 3.1416, 'eval_samples_per_second': 56.978, 'eval_steps_per_second': 56.978, 'epoch': 14.37} +{'loss': 40.1548, 'grad_norm': 191.09242248535156, 'learning_rate': 8.379310344827586e-06, 'epoch': 14.37} +{'loss': 39.5746, 'grad_norm': 249.24520874023438, 'learning_rate': 8.373865698729583e-06, 'epoch': 14.38} +{'loss': 39.2388, 'grad_norm': 266.509033203125, 'learning_rate': 8.368421052631578e-06, 'epoch': 14.38} +{'loss': 39.9314, 'grad_norm': 255.36209106445312, 'learning_rate': 8.362976406533577e-06, 'epoch': 14.38} +{'loss': 39.9124, 'grad_norm': 239.0690460205078, 'learning_rate': 8.357531760435572e-06, 'epoch': 14.39} +{'loss': 40.1307, 'grad_norm': 211.36135864257812, 'learning_rate': 8.352087114337568e-06, 'epoch': 14.39} +{'loss': 40.5252, 'grad_norm': 215.28912353515625, 'learning_rate': 8.346642468239565e-06, 'epoch': 14.39} +{'loss': 40.8348, 'grad_norm': 240.84271240234375, 'learning_rate': 8.34119782214156e-06, 'epoch': 14.4} +{'loss': 39.8228, 'grad_norm': 228.41758728027344, 'learning_rate': 8.335753176043557e-06, 'epoch': 14.4} +{'loss': 38.0696, 'grad_norm': 203.0228729248047, 'learning_rate': 8.330308529945554e-06, 'epoch': 14.4} + 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4000/5520 [3:30:32<1:14:10, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6064196825027466, 'eval_runtime': 3.1413, 'eval_samples_per_second': 56.983, 'eval_steps_per_second': 56.983, 'epoch': 14.4} +{'loss': 37.3921, 'grad_norm': 245.14646911621094, 'learning_rate': 8.32486388384755e-06, 'epoch': 14.41} +{'loss': 36.8794, 'grad_norm': 230.0685577392578, 'learning_rate': 8.319419237749545e-06, 'epoch': 14.41} +{'loss': 38.011, 'grad_norm': 203.02955627441406, 'learning_rate': 8.313974591651544e-06, 'epoch': 14.42} +{'loss': 37.8114, 'grad_norm': 276.0522766113281, 'learning_rate': 8.30852994555354e-06, 'epoch': 14.42} +{'loss': 38.1956, 'grad_norm': 205.56423950195312, 'learning_rate': 8.303085299455536e-06, 'epoch': 14.42} +{'loss': 36.4471, 'grad_norm': 200.71507263183594, 'learning_rate': 8.297640653357532e-06, 'epoch': 14.43} +{'loss': 37.6204, 'grad_norm': 217.8540496826172, 'learning_rate': 8.292196007259527e-06, 'epoch': 14.43} +{'loss': 38.6074, 'grad_norm': 228.0621337890625, 'learning_rate': 8.286751361161526e-06, 'epoch': 14.43} +{'loss': 37.8614, 'grad_norm': 246.05203247070312, 'learning_rate': 8.281306715063521e-06, 'epoch': 14.44} +{'loss': 37.4941, 'grad_norm': 216.0327911376953, 'learning_rate': 8.275862068965517e-06, 'epoch': 14.44} + 72%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4000/5520 [3:30:35<1:14:10, 2.93s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4010/5520 [3:31:05<1:13:57, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.605604887008667, 'eval_runtime': 3.1399, 'eval_samples_per_second': 57.008, 'eval_steps_per_second': 57.008, 'epoch': 14.44} +{'loss': 37.9576, 'grad_norm': 292.38653564453125, 'learning_rate': 8.270417422867514e-06, 'epoch': 14.44} +{'loss': 38.7505, 'grad_norm': 268.2558288574219, 'learning_rate': 8.26497277676951e-06, 'epoch': 14.45} +{'loss': 39.9733, 'grad_norm': 324.135498046875, 'learning_rate': 8.259528130671508e-06, 'epoch': 14.45} +{'loss': 38.8272, 'grad_norm': 269.1458740234375, 'learning_rate': 8.254083484573503e-06, 'epoch': 14.46} +{'loss': 37.7277, 'grad_norm': 214.26547241210938, 'learning_rate': 8.248638838475499e-06, 'epoch': 14.46} +{'loss': 39.0446, 'grad_norm': 256.4419860839844, 'learning_rate': 8.243194192377496e-06, 'epoch': 14.46} +{'loss': 34.2491, 'grad_norm': 226.9741973876953, 'learning_rate': 8.237749546279493e-06, 'epoch': 14.47} +{'loss': 32.1969, 'grad_norm': 238.4901123046875, 'learning_rate': 8.232304900181488e-06, 'epoch': 14.47} +{'loss': 32.5999, 'grad_norm': 260.6334533691406, 'learning_rate': 8.226860254083485e-06, 'epoch': 14.47} +{'loss': 30.3598, 'grad_norm': 227.4844970703125, 'learning_rate': 8.22141560798548e-06, 'epoch': 14.48} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4020/5520 [3:31:37<1:13:01, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6049788594245911, 'eval_runtime': 3.1451, 'eval_samples_per_second': 56.914, 'eval_steps_per_second': 56.914, 'epoch': 14.48} +{'loss': 32.3228, 'grad_norm': 231.49935913085938, 'learning_rate': 8.215970961887476e-06, 'epoch': 14.48} +{'loss': 32.1275, 'grad_norm': 246.83099365234375, 'learning_rate': 8.210526315789475e-06, 'epoch': 14.48} +{'loss': 32.9237, 'grad_norm': 283.0715026855469, 'learning_rate': 8.20508166969147e-06, 'epoch': 14.49} +{'loss': 34.3091, 'grad_norm': 264.58941650390625, 'learning_rate': 8.199637023593467e-06, 'epoch': 14.49} +{'loss': 34.2317, 'grad_norm': 207.57241821289062, 'learning_rate': 8.194192377495463e-06, 'epoch': 14.49} +{'loss': 35.5423, 'grad_norm': 266.3730163574219, 'learning_rate': 8.18874773139746e-06, 'epoch': 14.5} +{'loss': 34.0383, 'grad_norm': 274.2936096191406, 'learning_rate': 8.183303085299457e-06, 'epoch': 14.5} +{'loss': 35.6892, 'grad_norm': 345.4320068359375, 'learning_rate': 8.177858439201452e-06, 'epoch': 14.51} +{'loss': 34.4219, 'grad_norm': 254.9503631591797, 'learning_rate': 8.172413793103448e-06, 'epoch': 14.51} +{'loss': 34.6322, 'grad_norm': 277.176025390625, 'learning_rate': 8.166969147005445e-06, 'epoch': 14.51} + 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4030/5520 [3:32:09<1:12:56, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6078911423683167, 'eval_runtime': 3.1428, 'eval_samples_per_second': 56.956, 'eval_steps_per_second': 56.956, 'epoch': 14.51} +{'loss': 36.4843, 'grad_norm': 267.24737548828125, 'learning_rate': 8.161524500907442e-06, 'epoch': 14.52} +{'loss': 36.347, 'grad_norm': 291.5208740234375, 'learning_rate': 8.156079854809437e-06, 'epoch': 14.52} +{'loss': 36.5678, 'grad_norm': 331.9736328125, 'learning_rate': 8.150635208711434e-06, 'epoch': 14.52} +{'loss': 29.4886, 'grad_norm': 283.7598876953125, 'learning_rate': 8.14519056261343e-06, 'epoch': 14.53} +{'loss': 23.2178, 'grad_norm': 214.61712646484375, 'learning_rate': 8.139745916515427e-06, 'epoch': 14.53} +{'loss': 22.0972, 'grad_norm': 286.7948913574219, 'learning_rate': 8.134301270417424e-06, 'epoch': 14.53} +{'loss': 23.2764, 'grad_norm': 230.6540069580078, 'learning_rate': 8.128856624319419e-06, 'epoch': 14.54} +{'loss': 24.1889, 'grad_norm': 300.9560241699219, 'learning_rate': 8.123411978221416e-06, 'epoch': 14.54} +{'loss': 39.0039, 'grad_norm': 211.4068145751953, 'learning_rate': 8.117967332123412e-06, 'epoch': 14.55} +{'loss': 41.1832, 'grad_norm': 274.3965759277344, 'learning_rate': 8.112522686025409e-06, 'epoch': 14.55} + 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4040/5520 [3:32:41<1:12:16, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6079195141792297, 'eval_runtime': 3.1388, 'eval_samples_per_second': 57.027, 'eval_steps_per_second': 57.027, 'epoch': 14.55} +{'loss': 38.28, 'grad_norm': 247.50657653808594, 'learning_rate': 8.107078039927406e-06, 'epoch': 14.55} +{'loss': 39.5079, 'grad_norm': 216.0500946044922, 'learning_rate': 8.101633393829401e-06, 'epoch': 14.56} +{'loss': 40.1902, 'grad_norm': 271.37066650390625, 'learning_rate': 8.096188747731396e-06, 'epoch': 14.56} +{'loss': 40.2113, 'grad_norm': 233.35415649414062, 'learning_rate': 8.090744101633394e-06, 'epoch': 14.56} +{'loss': 39.794, 'grad_norm': 214.67381286621094, 'learning_rate': 8.08529945553539e-06, 'epoch': 14.57} +{'loss': 39.9214, 'grad_norm': 298.1142578125, 'learning_rate': 8.079854809437388e-06, 'epoch': 14.57} +{'loss': 40.9599, 'grad_norm': 197.40823364257812, 'learning_rate': 8.074410163339383e-06, 'epoch': 14.57} +{'loss': 40.2351, 'grad_norm': 242.1573028564453, 'learning_rate': 8.068965517241378e-06, 'epoch': 14.58} +{'loss': 39.0174, 'grad_norm': 224.93801879882812, 'learning_rate': 8.063520871143377e-06, 'epoch': 14.58} +{'loss': 37.4696, 'grad_norm': 295.4931335449219, 'learning_rate': 8.058076225045373e-06, 'epoch': 14.59} + 73%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4050/5520 [3:33:13<1:11:55, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6091852188110352, 'eval_runtime': 3.1368, 'eval_samples_per_second': 57.064, 'eval_steps_per_second': 57.064, 'epoch': 14.59} +{'loss': 37.3227, 'grad_norm': 302.8267517089844, 'learning_rate': 8.052631578947368e-06, 'epoch': 14.59} +{'loss': 38.433, 'grad_norm': 355.2379150390625, 'learning_rate': 8.047186932849365e-06, 'epoch': 14.59} +{'loss': 37.8352, 'grad_norm': 304.96234130859375, 'learning_rate': 8.04174228675136e-06, 'epoch': 14.6} +{'loss': 38.1734, 'grad_norm': 309.294921875, 'learning_rate': 8.036297640653358e-06, 'epoch': 14.6} +{'loss': 37.3612, 'grad_norm': 216.3328399658203, 'learning_rate': 8.030852994555355e-06, 'epoch': 14.6} +{'loss': 39.1612, 'grad_norm': 250.9885711669922, 'learning_rate': 8.02540834845735e-06, 'epoch': 14.61} +{'loss': 39.6837, 'grad_norm': 215.0750732421875, 'learning_rate': 8.019963702359347e-06, 'epoch': 14.61} +{'loss': 37.9746, 'grad_norm': 234.02069091796875, 'learning_rate': 8.014519056261342e-06, 'epoch': 14.61} +{'loss': 38.5114, 'grad_norm': 233.7527313232422, 'learning_rate': 8.00907441016334e-06, 'epoch': 14.62} +{'loss': 37.1647, 'grad_norm': 271.77496337890625, 'learning_rate': 8.003629764065337e-06, 'epoch': 14.62} + 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4060/5520 [3:33:45<1:11:12, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6047770977020264, 'eval_runtime': 3.1379, 'eval_samples_per_second': 57.045, 'eval_steps_per_second': 57.045, 'epoch': 14.62} +{'loss': 38.981, 'grad_norm': 281.7846374511719, 'learning_rate': 7.998185117967332e-06, 'epoch': 14.62} +{'loss': 39.4821, 'grad_norm': 308.8702697753906, 'learning_rate': 7.992740471869327e-06, 'epoch': 14.63} +{'loss': 39.0898, 'grad_norm': 366.1501770019531, 'learning_rate': 7.987295825771326e-06, 'epoch': 14.63} +{'loss': 39.6162, 'grad_norm': 276.92962646484375, 'learning_rate': 7.981851179673322e-06, 'epoch': 14.64} +{'loss': 38.5888, 'grad_norm': 220.0023651123047, 'learning_rate': 7.976406533575319e-06, 'epoch': 14.64} +{'loss': 38.4631, 'grad_norm': 268.57293701171875, 'learning_rate': 7.970961887477314e-06, 'epoch': 14.64} +{'loss': 35.4139, 'grad_norm': 307.8072509765625, 'learning_rate': 7.96551724137931e-06, 'epoch': 14.65} +{'loss': 33.3694, 'grad_norm': 228.11767578125, 'learning_rate': 7.960072595281308e-06, 'epoch': 14.65} +{'loss': 31.3355, 'grad_norm': 217.6271209716797, 'learning_rate': 7.954627949183304e-06, 'epoch': 14.65} +{'loss': 32.8306, 'grad_norm': 232.31944274902344, 'learning_rate': 7.949183303085299e-06, 'epoch': 14.66} + 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4070/5520 [3:34:17<1:10:46, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6018487215042114, 'eval_runtime': 3.1396, 'eval_samples_per_second': 57.014, 'eval_steps_per_second': 57.014, 'epoch': 14.66} +{'loss': 33.2157, 'grad_norm': 244.58303833007812, 'learning_rate': 7.943738656987296e-06, 'epoch': 14.66} +{'loss': 33.6361, 'grad_norm': 306.12005615234375, 'learning_rate': 7.938294010889293e-06, 'epoch': 14.66} +{'loss': 32.3917, 'grad_norm': 266.2792053222656, 'learning_rate': 7.932849364791288e-06, 'epoch': 14.67} +{'loss': 33.3598, 'grad_norm': 259.373779296875, 'learning_rate': 7.927404718693286e-06, 'epoch': 14.67} +{'loss': 32.2699, 'grad_norm': 247.35179138183594, 'learning_rate': 7.921960072595281e-06, 'epoch': 14.68} +{'loss': 33.0305, 'grad_norm': 280.02960205078125, 'learning_rate': 7.916515426497278e-06, 'epoch': 14.68} +{'loss': 35.1854, 'grad_norm': 394.6492919921875, 'learning_rate': 7.911070780399275e-06, 'epoch': 14.68} +{'loss': 35.1836, 'grad_norm': 298.6531677246094, 'learning_rate': 7.90562613430127e-06, 'epoch': 14.69} +{'loss': 32.6266, 'grad_norm': 250.960693359375, 'learning_rate': 7.900181488203268e-06, 'epoch': 14.69} +{'loss': 35.5937, 'grad_norm': 240.4825897216797, 'learning_rate': 7.894736842105263e-06, 'epoch': 14.69} + 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4080/5520 [3:34:49<1:10:23, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6042065620422363, 'eval_runtime': 3.1453, 'eval_samples_per_second': 56.91, 'eval_steps_per_second': 56.91, 'epoch': 14.69} +{'loss': 36.4225, 'grad_norm': 274.6919860839844, 'learning_rate': 7.889292196007258e-06, 'epoch': 14.7} +{'loss': 36.5503, 'grad_norm': 245.4980010986328, 'learning_rate': 7.883847549909257e-06, 'epoch': 14.7} +{'loss': 35.38, 'grad_norm': 373.362548828125, 'learning_rate': 7.878402903811252e-06, 'epoch': 14.7} +{'loss': 28.869, 'grad_norm': 337.5054626464844, 'learning_rate': 7.872958257713248e-06, 'epoch': 14.71} +{'loss': 22.99, 'grad_norm': 238.19195556640625, 'learning_rate': 7.867513611615245e-06, 'epoch': 14.71} +{'loss': 22.5274, 'grad_norm': 254.274169921875, 'learning_rate': 7.862068965517242e-06, 'epoch': 14.72} +{'loss': 23.6756, 'grad_norm': 236.74099731445312, 'learning_rate': 7.856624319419239e-06, 'epoch': 14.72} +{'loss': 23.2024, 'grad_norm': 239.69911193847656, 'learning_rate': 7.851179673321234e-06, 'epoch': 14.72} +{'loss': 40.0026, 'grad_norm': 296.35101318359375, 'learning_rate': 7.84573502722323e-06, 'epoch': 14.73} +{'loss': 41.2817, 'grad_norm': 202.52577209472656, 'learning_rate': 7.840290381125227e-06, 'epoch': 14.73} + 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4090/5520 [3:35:21<1:09:55, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6069625616073608, 'eval_runtime': 3.1383, 'eval_samples_per_second': 57.037, 'eval_steps_per_second': 57.037, 'epoch': 14.73} +{'loss': 40.5411, 'grad_norm': 290.4194030761719, 'learning_rate': 7.834845735027224e-06, 'epoch': 14.73} +{'loss': 40.6588, 'grad_norm': 284.0616455078125, 'learning_rate': 7.82940108892922e-06, 'epoch': 14.74} +{'loss': 38.986, 'grad_norm': 289.5628967285156, 'learning_rate': 7.823956442831216e-06, 'epoch': 14.74} +{'loss': 38.83, 'grad_norm': 217.09841918945312, 'learning_rate': 7.818511796733212e-06, 'epoch': 14.74} +{'loss': 39.4897, 'grad_norm': 223.49148559570312, 'learning_rate': 7.813067150635209e-06, 'epoch': 14.75} +{'loss': 38.9963, 'grad_norm': 240.41578674316406, 'learning_rate': 7.807622504537206e-06, 'epoch': 14.75} +{'loss': 39.7875, 'grad_norm': 206.7586212158203, 'learning_rate': 7.802177858439201e-06, 'epoch': 14.75} +{'loss': 39.3977, 'grad_norm': 239.97174072265625, 'learning_rate': 7.796733212341198e-06, 'epoch': 14.76} +{'loss': 38.7869, 'grad_norm': 204.50839233398438, 'learning_rate': 7.791288566243194e-06, 'epoch': 14.76} +{'loss': 36.7325, 'grad_norm': 216.79583740234375, 'learning_rate': 7.785843920145191e-06, 'epoch': 14.77} + 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4100/5520 [3:35:53<1:09:07, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6052367091178894, 'eval_runtime': 3.1442, 'eval_samples_per_second': 56.93, 'eval_steps_per_second': 56.93, 'epoch': 14.77} +{'loss': 38.2377, 'grad_norm': 251.13209533691406, 'learning_rate': 7.780399274047188e-06, 'epoch': 14.77} +{'loss': 36.8119, 'grad_norm': 222.745361328125, 'learning_rate': 7.774954627949183e-06, 'epoch': 14.77} +{'loss': 38.1241, 'grad_norm': 252.72117614746094, 'learning_rate': 7.769509981851179e-06, 'epoch': 14.78} +{'loss': 37.6839, 'grad_norm': 272.38165283203125, 'learning_rate': 7.764065335753176e-06, 'epoch': 14.78} +{'loss': 38.1267, 'grad_norm': 301.0637512207031, 'learning_rate': 7.758620689655173e-06, 'epoch': 14.78} +{'loss': 36.9847, 'grad_norm': 240.22515869140625, 'learning_rate': 7.75317604355717e-06, 'epoch': 14.79} +{'loss': 39.0368, 'grad_norm': 273.3988952636719, 'learning_rate': 7.747731397459165e-06, 'epoch': 14.79} +{'loss': 38.6439, 'grad_norm': 252.66497802734375, 'learning_rate': 7.74228675136116e-06, 'epoch': 14.79} +{'loss': 36.3503, 'grad_norm': 246.3287811279297, 'learning_rate': 7.73684210526316e-06, 'epoch': 14.8} +{'loss': 38.1603, 'grad_norm': 220.6704559326172, 'learning_rate': 7.731397459165155e-06, 'epoch': 14.8} + 74%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4110/5520 [3:36:26<1:08:51, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6043270826339722, 'eval_runtime': 3.1383, 'eval_samples_per_second': 57.037, 'eval_steps_per_second': 57.037, 'epoch': 14.8} +{'loss': 38.9624, 'grad_norm': 215.94979858398438, 'learning_rate': 7.72595281306715e-06, 'epoch': 14.81} +{'loss': 39.2196, 'grad_norm': 228.76815795898438, 'learning_rate': 7.720508166969147e-06, 'epoch': 14.81} +{'loss': 39.3677, 'grad_norm': 216.1998291015625, 'learning_rate': 7.715063520871143e-06, 'epoch': 14.81} +{'loss': 38.1856, 'grad_norm': 266.1018981933594, 'learning_rate': 7.70961887477314e-06, 'epoch': 14.82} +{'loss': 39.6282, 'grad_norm': 234.2566680908203, 'learning_rate': 7.704174228675137e-06, 'epoch': 14.82} +{'loss': 38.2693, 'grad_norm': 241.16615295410156, 'learning_rate': 7.698729582577132e-06, 'epoch': 14.82} +{'loss': 37.7161, 'grad_norm': 332.6835021972656, 'learning_rate': 7.69328493647913e-06, 'epoch': 14.83} +{'loss': 33.9704, 'grad_norm': 260.1654357910156, 'learning_rate': 7.687840290381126e-06, 'epoch': 14.83} +{'loss': 32.5126, 'grad_norm': 214.45509338378906, 'learning_rate': 7.682395644283122e-06, 'epoch': 14.83} +{'loss': 32.0682, 'grad_norm': 257.4847717285156, 'learning_rate': 7.676950998185119e-06, 'epoch': 14.84} + 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4120/5520 [3:36:58<1:08:18, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6022929549217224, 'eval_runtime': 3.1427, 'eval_samples_per_second': 56.957, 'eval_steps_per_second': 56.957, 'epoch': 14.84} +{'loss': 32.8817, 'grad_norm': 241.302978515625, 'learning_rate': 7.671506352087114e-06, 'epoch': 14.84} +{'loss': 31.9995, 'grad_norm': 238.0950164794922, 'learning_rate': 7.66606170598911e-06, 'epoch': 14.85} +{'loss': 32.9681, 'grad_norm': 239.700439453125, 'learning_rate': 7.660617059891108e-06, 'epoch': 14.85} +{'loss': 33.6878, 'grad_norm': 234.23890686035156, 'learning_rate': 7.655172413793104e-06, 'epoch': 14.85} +{'loss': 34.2346, 'grad_norm': 367.3103332519531, 'learning_rate': 7.6497277676951e-06, 'epoch': 14.86} +{'loss': 35.0148, 'grad_norm': 221.31381225585938, 'learning_rate': 7.644283121597096e-06, 'epoch': 14.86} +{'loss': 34.8326, 'grad_norm': 352.1162109375, 'learning_rate': 7.638838475499092e-06, 'epoch': 14.86} +{'loss': 34.2522, 'grad_norm': 296.8202209472656, 'learning_rate': 7.63339382940109e-06, 'epoch': 14.87} +{'loss': 34.5005, 'grad_norm': 283.4679870605469, 'learning_rate': 7.627949183303086e-06, 'epoch': 14.87} +{'loss': 34.9581, 'grad_norm': 249.95033264160156, 'learning_rate': 7.622504537205082e-06, 'epoch': 14.87} + 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4130/5520 [3:37:30<1:08:05, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6031190752983093, 'eval_runtime': 3.1392, 'eval_samples_per_second': 57.02, 'eval_steps_per_second': 57.02, 'epoch': 14.87} +{'loss': 35.3024, 'grad_norm': 235.65065002441406, 'learning_rate': 7.6170598911070774e-06, 'epoch': 14.88} +{'loss': 35.4444, 'grad_norm': 258.1300964355469, 'learning_rate': 7.611615245009075e-06, 'epoch': 14.88} +{'loss': 36.5643, 'grad_norm': 262.9698791503906, 'learning_rate': 7.606170598911072e-06, 'epoch': 14.88} +{'loss': 33.0157, 'grad_norm': 274.81781005859375, 'learning_rate': 7.600725952813067e-06, 'epoch': 14.89} +{'loss': 22.226, 'grad_norm': 205.41566467285156, 'learning_rate': 7.595281306715063e-06, 'epoch': 14.89} +{'loss': 22.1499, 'grad_norm': 231.19541931152344, 'learning_rate': 7.5898366606170594e-06, 'epoch': 14.9} +{'loss': 23.3987, 'grad_norm': 203.04856872558594, 'learning_rate': 7.584392014519057e-06, 'epoch': 14.9} +{'loss': 24.3649, 'grad_norm': 289.031005859375, 'learning_rate': 7.578947368421053e-06, 'epoch': 14.9} +{'loss': 41.146, 'grad_norm': 285.2325744628906, 'learning_rate': 7.573502722323049e-06, 'epoch': 14.91} +{'loss': 40.3871, 'grad_norm': 232.21603393554688, 'learning_rate': 7.568058076225045e-06, 'epoch': 14.91} + 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4140/5520 [3:38:02<1:07:21, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6056836247444153, 'eval_runtime': 3.142, 'eval_samples_per_second': 56.969, 'eval_steps_per_second': 56.969, 'epoch': 14.91} +{'loss': 39.5914, 'grad_norm': 358.63238525390625, 'learning_rate': 7.562613430127043e-06, 'epoch': 14.91} +{'loss': 39.4552, 'grad_norm': 262.66741943359375, 'learning_rate': 7.5571687840290385e-06, 'epoch': 14.92} +{'loss': 41.5379, 'grad_norm': 228.7096710205078, 'learning_rate': 7.551724137931035e-06, 'epoch': 14.92} +{'loss': 39.8314, 'grad_norm': 266.6537780761719, 'learning_rate': 7.546279491833031e-06, 'epoch': 14.92} +{'loss': 37.8247, 'grad_norm': 329.5486755371094, 'learning_rate': 7.540834845735027e-06, 'epoch': 14.93} +{'loss': 36.8491, 'grad_norm': 391.49127197265625, 'learning_rate': 7.535390199637024e-06, 'epoch': 14.93} +{'loss': 37.7245, 'grad_norm': 342.66632080078125, 'learning_rate': 7.5299455535390205e-06, 'epoch': 14.94} +{'loss': 38.3694, 'grad_norm': 309.25115966796875, 'learning_rate': 7.524500907441017e-06, 'epoch': 14.94} +{'loss': 38.5028, 'grad_norm': 438.21539306640625, 'learning_rate': 7.519056261343012e-06, 'epoch': 14.94} +{'loss': 39.2531, 'grad_norm': 314.2667541503906, 'learning_rate': 7.513611615245008e-06, 'epoch': 14.95} + 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4150/5520 [3:38:34<1:06:53, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6075459718704224, 'eval_runtime': 3.1365, 'eval_samples_per_second': 57.069, 'eval_steps_per_second': 57.069, 'epoch': 14.95} +{'loss': 38.3904, 'grad_norm': 348.3675537109375, 'learning_rate': 7.508166969147006e-06, 'epoch': 14.95} +{'loss': 39.0257, 'grad_norm': 448.6506652832031, 'learning_rate': 7.5027223230490025e-06, 'epoch': 14.95} +{'loss': 36.8144, 'grad_norm': 407.4074401855469, 'learning_rate': 7.497277676950998e-06, 'epoch': 14.96} +{'loss': 34.3852, 'grad_norm': 311.0707702636719, 'learning_rate': 7.491833030852995e-06, 'epoch': 14.96} +{'loss': 32.9411, 'grad_norm': 316.660400390625, 'learning_rate': 7.486388384754991e-06, 'epoch': 14.96} +{'loss': 32.9947, 'grad_norm': 405.3203125, 'learning_rate': 7.480943738656988e-06, 'epoch': 14.97} +{'loss': 34.9284, 'grad_norm': 246.47296142578125, 'learning_rate': 7.475499092558984e-06, 'epoch': 14.97} +{'loss': 33.5852, 'grad_norm': 250.6293487548828, 'learning_rate': 7.47005444646098e-06, 'epoch': 14.98} +{'loss': 34.5658, 'grad_norm': 367.8492736816406, 'learning_rate': 7.464609800362977e-06, 'epoch': 14.98} +{'loss': 35.4483, 'grad_norm': 299.1382141113281, 'learning_rate': 7.459165154264972e-06, 'epoch': 14.98} + 75%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4160/5520 [3:39:06<1:06:26, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6054605841636658, 'eval_runtime': 3.1384, 'eval_samples_per_second': 57.035, 'eval_steps_per_second': 57.035, 'epoch': 14.98} +{'loss': 35.9366, 'grad_norm': 448.0080261230469, 'learning_rate': 7.453720508166969e-06, 'epoch': 14.99} +{'loss': 37.6222, 'grad_norm': 496.0691223144531, 'learning_rate': 7.448275862068966e-06, 'epoch': 14.99} +{'loss': 27.5573, 'grad_norm': 300.7026062011719, 'learning_rate': 7.442831215970963e-06, 'epoch': 14.99} +{'loss': 23.0142, 'grad_norm': 183.81434631347656, 'learning_rate': 7.437386569872958e-06, 'epoch': 15.0} +{'loss': 21.0732, 'grad_norm': 198.61032104492188, 'learning_rate': 7.431941923774954e-06, 'epoch': 15.0} +{'loss': 39.1709, 'grad_norm': 244.2176513671875, 'learning_rate': 7.426497277676951e-06, 'epoch': 15.0} +{'loss': 39.9364, 'grad_norm': 211.74375915527344, 'learning_rate': 7.421052631578948e-06, 'epoch': 15.01} +{'loss': 39.5166, 'grad_norm': 216.2489013671875, 'learning_rate': 7.415607985480944e-06, 'epoch': 15.01} +{'loss': 39.6738, 'grad_norm': 279.423583984375, 'learning_rate': 7.41016333938294e-06, 'epoch': 15.01} +{'loss': 39.3556, 'grad_norm': 279.117919921875, 'learning_rate': 7.404718693284937e-06, 'epoch': 15.02} + 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4170/5520 [3:39:38<1:05:48, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6020110249519348, 'eval_runtime': 3.141, 'eval_samples_per_second': 56.987, 'eval_steps_per_second': 56.987, 'epoch': 15.02} +{'loss': 38.9987, 'grad_norm': 213.57162475585938, 'learning_rate': 7.399274047186933e-06, 'epoch': 15.02} +{'loss': 39.1696, 'grad_norm': 184.1968994140625, 'learning_rate': 7.393829401088929e-06, 'epoch': 15.03} +{'loss': 39.8897, 'grad_norm': 219.38076782226562, 'learning_rate': 7.388384754990926e-06, 'epoch': 15.03} +{'loss': 40.7633, 'grad_norm': 225.4325714111328, 'learning_rate': 7.382940108892922e-06, 'epoch': 15.03} +{'loss': 39.8768, 'grad_norm': 274.78472900390625, 'learning_rate': 7.377495462794918e-06, 'epoch': 15.04} +{'loss': 38.4735, 'grad_norm': 269.5557861328125, 'learning_rate': 7.3720508166969146e-06, 'epoch': 15.04} +{'loss': 37.2117, 'grad_norm': 219.78761291503906, 'learning_rate': 7.366606170598912e-06, 'epoch': 15.04} +{'loss': 36.6855, 'grad_norm': 205.49771118164062, 'learning_rate': 7.361161524500908e-06, 'epoch': 15.05} +{'loss': 35.4408, 'grad_norm': 235.72068786621094, 'learning_rate': 7.355716878402904e-06, 'epoch': 15.05} +{'loss': 38.2297, 'grad_norm': 218.84732055664062, 'learning_rate': 7.3502722323049e-06, 'epoch': 15.05} + 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4180/5520 [3:40:10<1:05:04, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6053969860076904, 'eval_runtime': 3.1349, 'eval_samples_per_second': 57.099, 'eval_steps_per_second': 57.099, 'epoch': 15.05} +{'loss': 35.7271, 'grad_norm': 195.80685424804688, 'learning_rate': 7.3448275862068966e-06, 'epoch': 15.06} +{'loss': 37.3393, 'grad_norm': 207.12481689453125, 'learning_rate': 7.339382940108894e-06, 'epoch': 15.06} +{'loss': 36.9505, 'grad_norm': 211.0287322998047, 'learning_rate': 7.333938294010889e-06, 'epoch': 15.07} +{'loss': 38.1225, 'grad_norm': 279.0206604003906, 'learning_rate': 7.328493647912886e-06, 'epoch': 15.07} +{'loss': 37.1117, 'grad_norm': 206.3834228515625, 'learning_rate': 7.323049001814882e-06, 'epoch': 15.07} +{'loss': 36.1971, 'grad_norm': 266.8707275390625, 'learning_rate': 7.3176043557168786e-06, 'epoch': 15.08} +{'loss': 37.4714, 'grad_norm': 260.35791015625, 'learning_rate': 7.312159709618875e-06, 'epoch': 15.08} +{'loss': 37.621, 'grad_norm': 281.152587890625, 'learning_rate': 7.306715063520871e-06, 'epoch': 15.08} +{'loss': 38.919, 'grad_norm': 246.25758361816406, 'learning_rate': 7.301270417422868e-06, 'epoch': 15.09} +{'loss': 39.5783, 'grad_norm': 378.4499816894531, 'learning_rate': 7.2958257713248635e-06, 'epoch': 15.09} + 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4190/5520 [3:40:41<1:04:40, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6071392297744751, 'eval_runtime': 3.1395, 'eval_samples_per_second': 57.016, 'eval_steps_per_second': 57.016, 'epoch': 15.09} +{'loss': 38.9023, 'grad_norm': 421.0552673339844, 'learning_rate': 7.2903811252268606e-06, 'epoch': 15.09} +{'loss': 39.6466, 'grad_norm': 264.24359130859375, 'learning_rate': 7.284936479128857e-06, 'epoch': 15.1} +{'loss': 39.4899, 'grad_norm': 246.88182067871094, 'learning_rate': 7.279491833030854e-06, 'epoch': 15.1} +{'loss': 35.6587, 'grad_norm': 236.83848571777344, 'learning_rate': 7.274047186932849e-06, 'epoch': 15.1} +{'loss': 34.1567, 'grad_norm': 278.31573486328125, 'learning_rate': 7.2686025408348455e-06, 'epoch': 15.11} +{'loss': 32.1268, 'grad_norm': 243.71160888671875, 'learning_rate': 7.2631578947368426e-06, 'epoch': 15.11} +{'loss': 31.498, 'grad_norm': 233.81211853027344, 'learning_rate': 7.257713248638839e-06, 'epoch': 15.12} +{'loss': 32.3648, 'grad_norm': 243.12672424316406, 'learning_rate': 7.252268602540835e-06, 'epoch': 15.12} +{'loss': 32.2236, 'grad_norm': 293.38299560546875, 'learning_rate': 7.246823956442831e-06, 'epoch': 15.12} +{'loss': 34.5535, 'grad_norm': 249.70071411132812, 'learning_rate': 7.241379310344828e-06, 'epoch': 15.13} + 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4200/5520 [3:41:14<1:04:31, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6050077676773071, 'eval_runtime': 3.144, 'eval_samples_per_second': 56.934, 'eval_steps_per_second': 56.934, 'epoch': 15.13} +{'loss': 32.9552, 'grad_norm': 300.9483642578125, 'learning_rate': 7.235934664246824e-06, 'epoch': 15.13} +{'loss': 33.0974, 'grad_norm': 228.797607421875, 'learning_rate': 7.23049001814882e-06, 'epoch': 15.13} +{'loss': 34.2865, 'grad_norm': 279.9087219238281, 'learning_rate': 7.225045372050817e-06, 'epoch': 15.14} +{'loss': 34.5603, 'grad_norm': 254.15928649902344, 'learning_rate': 7.219600725952813e-06, 'epoch': 15.14} +{'loss': 34.6428, 'grad_norm': 314.19012451171875, 'learning_rate': 7.2141560798548095e-06, 'epoch': 15.14} +{'loss': 33.6676, 'grad_norm': 291.8244323730469, 'learning_rate': 7.208711433756806e-06, 'epoch': 15.15} +{'loss': 33.9118, 'grad_norm': 276.4428405761719, 'learning_rate': 7.203266787658803e-06, 'epoch': 15.15} +{'loss': 35.1971, 'grad_norm': 265.7801208496094, 'learning_rate': 7.197822141560799e-06, 'epoch': 15.16} +{'loss': 33.0843, 'grad_norm': 244.48667907714844, 'learning_rate': 7.192377495462795e-06, 'epoch': 15.16} +{'loss': 36.7957, 'grad_norm': 348.6037902832031, 'learning_rate': 7.1869328493647915e-06, 'epoch': 15.16} + 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4200/5520 [3:41:17<1:04:31, 2.93s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4210/5520 [3:41:46<1:04:14, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6052607297897339, 'eval_runtime': 3.1435, 'eval_samples_per_second': 56.944, 'eval_steps_per_second': 56.944, 'epoch': 15.16} +{'loss': 28.0234, 'grad_norm': 227.31346130371094, 'learning_rate': 7.181488203266788e-06, 'epoch': 15.17} +{'loss': 22.5147, 'grad_norm': 208.75048828125, 'learning_rate': 7.176043557168784e-06, 'epoch': 15.17} +{'loss': 22.1029, 'grad_norm': 222.91090393066406, 'learning_rate': 7.17059891107078e-06, 'epoch': 15.17} +{'loss': 22.9827, 'grad_norm': 219.40621948242188, 'learning_rate': 7.165154264972777e-06, 'epoch': 15.18} +{'loss': 23.6974, 'grad_norm': 229.11813354492188, 'learning_rate': 7.1597096188747735e-06, 'epoch': 15.18} +{'loss': 39.6585, 'grad_norm': 256.7950744628906, 'learning_rate': 7.15426497277677e-06, 'epoch': 15.18} +{'loss': 40.0478, 'grad_norm': 237.47613525390625, 'learning_rate': 7.148820326678766e-06, 'epoch': 15.19} +{'loss': 39.7604, 'grad_norm': 259.54296875, 'learning_rate': 7.143375680580762e-06, 'epoch': 15.19} +{'loss': 39.0201, 'grad_norm': 249.7389678955078, 'learning_rate': 7.137931034482759e-06, 'epoch': 15.2} +{'loss': 39.8575, 'grad_norm': 298.4624938964844, 'learning_rate': 7.132486388384755e-06, 'epoch': 15.2} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 76%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4220/5520 [3:42:19<1:03:32, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6088115572929382, 'eval_runtime': 3.1406, 'eval_samples_per_second': 56.996, 'eval_steps_per_second': 56.996, 'epoch': 15.2} +{'loss': 38.8929, 'grad_norm': 267.57659912109375, 'learning_rate': 7.127041742286752e-06, 'epoch': 15.2} +{'loss': 39.6078, 'grad_norm': 243.88333129882812, 'learning_rate': 7.121597096188748e-06, 'epoch': 15.21} +{'loss': 39.9488, 'grad_norm': 268.2644348144531, 'learning_rate': 7.116152450090745e-06, 'epoch': 15.21} +{'loss': 40.1645, 'grad_norm': 240.2657928466797, 'learning_rate': 7.11070780399274e-06, 'epoch': 15.21} +{'loss': 38.2229, 'grad_norm': 198.76910400390625, 'learning_rate': 7.105263157894737e-06, 'epoch': 15.22} +{'loss': 39.5294, 'grad_norm': 234.11170959472656, 'learning_rate': 7.099818511796734e-06, 'epoch': 15.22} +{'loss': 36.9752, 'grad_norm': 192.80194091796875, 'learning_rate': 7.094373865698729e-06, 'epoch': 15.22} +{'loss': 36.1043, 'grad_norm': 241.8236846923828, 'learning_rate': 7.088929219600726e-06, 'epoch': 15.23} +{'loss': 37.7911, 'grad_norm': 451.6199645996094, 'learning_rate': 7.083484573502722e-06, 'epoch': 15.23} +{'loss': 35.5202, 'grad_norm': 351.9429626464844, 'learning_rate': 7.0780399274047195e-06, 'epoch': 15.23} + 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4230/5520 [3:42:51<1:03:02, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6093130111694336, 'eval_runtime': 3.1403, 'eval_samples_per_second': 57.001, 'eval_steps_per_second': 57.001, 'epoch': 15.23} +{'loss': 37.5552, 'grad_norm': 266.4995422363281, 'learning_rate': 7.072595281306715e-06, 'epoch': 15.24} +{'loss': 37.1315, 'grad_norm': 258.74578857421875, 'learning_rate': 7.067150635208712e-06, 'epoch': 15.24} +{'loss': 36.9237, 'grad_norm': 233.30921936035156, 'learning_rate': 7.061705989110708e-06, 'epoch': 15.25} +{'loss': 38.0112, 'grad_norm': 235.8688201904297, 'learning_rate': 7.056261343012704e-06, 'epoch': 15.25} +{'loss': 38.5641, 'grad_norm': 214.88436889648438, 'learning_rate': 7.050816696914701e-06, 'epoch': 15.25} +{'loss': 36.7125, 'grad_norm': 252.64144897460938, 'learning_rate': 7.045372050816697e-06, 'epoch': 15.26} +{'loss': 37.5956, 'grad_norm': 293.78424072265625, 'learning_rate': 7.039927404718694e-06, 'epoch': 15.26} +{'loss': 38.1829, 'grad_norm': 234.13510131835938, 'learning_rate': 7.03448275862069e-06, 'epoch': 15.26} +{'loss': 39.0785, 'grad_norm': 279.534912109375, 'learning_rate': 7.029038112522686e-06, 'epoch': 15.27} +{'loss': 39.1753, 'grad_norm': 246.4442596435547, 'learning_rate': 7.023593466424683e-06, 'epoch': 15.27} + 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4240/5520 [3:43:23<1:02:31, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6043311357498169, 'eval_runtime': 3.1452, 'eval_samples_per_second': 56.913, 'eval_steps_per_second': 56.913, 'epoch': 15.27} +{'loss': 39.8464, 'grad_norm': 233.87466430664062, 'learning_rate': 7.018148820326679e-06, 'epoch': 15.27} +{'loss': 37.9721, 'grad_norm': 228.54898071289062, 'learning_rate': 7.012704174228675e-06, 'epoch': 15.28} +{'loss': 38.9153, 'grad_norm': 273.70050048828125, 'learning_rate': 7.007259528130671e-06, 'epoch': 15.28} +{'loss': 36.7607, 'grad_norm': 269.8402404785156, 'learning_rate': 7.001814882032668e-06, 'epoch': 15.29} +{'loss': 35.3684, 'grad_norm': 260.13629150390625, 'learning_rate': 6.996370235934665e-06, 'epoch': 15.29} +{'loss': 32.8784, 'grad_norm': 223.9878692626953, 'learning_rate': 6.990925589836661e-06, 'epoch': 15.29} +{'loss': 31.3751, 'grad_norm': 225.69212341308594, 'learning_rate': 6.985480943738657e-06, 'epoch': 15.3} +{'loss': 31.5331, 'grad_norm': 215.99801635742188, 'learning_rate': 6.980036297640653e-06, 'epoch': 15.3} +{'loss': 32.5806, 'grad_norm': 263.26568603515625, 'learning_rate': 6.97459165154265e-06, 'epoch': 15.3} +{'loss': 31.6379, 'grad_norm': 203.2392578125, 'learning_rate': 6.969147005444646e-06, 'epoch': 15.31} + 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4250/5520 [3:43:55<1:02:15, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6046441793441772, 'eval_runtime': 3.1408, 'eval_samples_per_second': 56.992, 'eval_steps_per_second': 56.992, 'epoch': 15.31} +{'loss': 33.7034, 'grad_norm': 221.2167510986328, 'learning_rate': 6.963702359346643e-06, 'epoch': 15.31} +{'loss': 32.5511, 'grad_norm': 212.58737182617188, 'learning_rate': 6.958257713248639e-06, 'epoch': 15.31} +{'loss': 33.2513, 'grad_norm': 270.7123718261719, 'learning_rate': 6.952813067150635e-06, 'epoch': 15.32} +{'loss': 33.9559, 'grad_norm': 270.2066345214844, 'learning_rate': 6.9473684210526315e-06, 'epoch': 15.32} +{'loss': 33.9916, 'grad_norm': 232.8043212890625, 'learning_rate': 6.941923774954628e-06, 'epoch': 15.33} +{'loss': 35.2098, 'grad_norm': 325.419921875, 'learning_rate': 6.936479128856625e-06, 'epoch': 15.33} +{'loss': 35.0784, 'grad_norm': 303.326416015625, 'learning_rate': 6.93103448275862e-06, 'epoch': 15.33} +{'loss': 35.9915, 'grad_norm': 327.05963134765625, 'learning_rate': 6.925589836660617e-06, 'epoch': 15.34} +{'loss': 35.1914, 'grad_norm': 326.58795166015625, 'learning_rate': 6.9201451905626135e-06, 'epoch': 15.34} +{'loss': 37.1535, 'grad_norm': 406.38812255859375, 'learning_rate': 6.914700544464611e-06, 'epoch': 15.34} + 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4260/5520 [3:44:27<1:01:29, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6056071519851685, 'eval_runtime': 3.1411, 'eval_samples_per_second': 56.986, 'eval_steps_per_second': 56.986, 'epoch': 15.34} +{'loss': 29.8698, 'grad_norm': 325.6965637207031, 'learning_rate': 6.909255898366606e-06, 'epoch': 15.35} +{'loss': 22.2995, 'grad_norm': 212.59727478027344, 'learning_rate': 6.903811252268603e-06, 'epoch': 15.35} +{'loss': 23.1014, 'grad_norm': 257.447509765625, 'learning_rate': 6.898366606170599e-06, 'epoch': 15.35} +{'loss': 23.2319, 'grad_norm': 266.139892578125, 'learning_rate': 6.8929219600725955e-06, 'epoch': 15.36} +{'loss': 23.7218, 'grad_norm': 332.7207336425781, 'learning_rate': 6.887477313974592e-06, 'epoch': 15.36} +{'loss': 39.5787, 'grad_norm': 272.7341003417969, 'learning_rate': 6.882032667876588e-06, 'epoch': 15.36} +{'loss': 41.0874, 'grad_norm': 259.00872802734375, 'learning_rate': 6.876588021778585e-06, 'epoch': 15.37} +{'loss': 38.9811, 'grad_norm': 236.87033081054688, 'learning_rate': 6.8711433756805804e-06, 'epoch': 15.37} +{'loss': 39.481, 'grad_norm': 293.6808776855469, 'learning_rate': 6.8656987295825775e-06, 'epoch': 15.38} +{'loss': 39.4595, 'grad_norm': 266.0845947265625, 'learning_rate': 6.860254083484574e-06, 'epoch': 15.38} + 77%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4270/5520 [3:44:59<1:01:06, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6039742231369019, 'eval_runtime': 3.1378, 'eval_samples_per_second': 57.047, 'eval_steps_per_second': 57.047, 'epoch': 15.38} +{'loss': 38.8899, 'grad_norm': 398.0877685546875, 'learning_rate': 6.85480943738657e-06, 'epoch': 15.38} +{'loss': 39.2194, 'grad_norm': 208.37376403808594, 'learning_rate': 6.849364791288566e-06, 'epoch': 15.39} +{'loss': 38.9911, 'grad_norm': 214.6958770751953, 'learning_rate': 6.8439201451905624e-06, 'epoch': 15.39} +{'loss': 40.5973, 'grad_norm': 210.2147674560547, 'learning_rate': 6.8384754990925595e-06, 'epoch': 15.39} +{'loss': 39.3936, 'grad_norm': 240.47030639648438, 'learning_rate': 6.833030852994556e-06, 'epoch': 15.4} +{'loss': 40.0848, 'grad_norm': 273.86883544921875, 'learning_rate': 6.827586206896552e-06, 'epoch': 15.4} +{'loss': 36.5967, 'grad_norm': 239.36453247070312, 'learning_rate': 6.822141560798548e-06, 'epoch': 15.4} +{'loss': 37.8173, 'grad_norm': 215.3413543701172, 'learning_rate': 6.8166969147005444e-06, 'epoch': 15.41} +{'loss': 37.7175, 'grad_norm': 260.1557312011719, 'learning_rate': 6.811252268602541e-06, 'epoch': 15.41} +{'loss': 37.0618, 'grad_norm': 239.4988555908203, 'learning_rate': 6.805807622504537e-06, 'epoch': 15.42} + 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4280/5520 [3:45:31<1:00:25, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6049810647964478, 'eval_runtime': 3.1372, 'eval_samples_per_second': 57.057, 'eval_steps_per_second': 57.057, 'epoch': 15.42} +{'loss': 37.0687, 'grad_norm': 223.06094360351562, 'learning_rate': 6.800362976406534e-06, 'epoch': 15.42} +{'loss': 35.9437, 'grad_norm': 261.7460632324219, 'learning_rate': 6.79491833030853e-06, 'epoch': 15.42} +{'loss': 38.3316, 'grad_norm': 230.92135620117188, 'learning_rate': 6.7894736842105264e-06, 'epoch': 15.43} +{'loss': 38.2666, 'grad_norm': 370.6309509277344, 'learning_rate': 6.784029038112523e-06, 'epoch': 15.43} +{'loss': 38.1159, 'grad_norm': 249.7823944091797, 'learning_rate': 6.77858439201452e-06, 'epoch': 15.43} +{'loss': 37.6548, 'grad_norm': 404.1676330566406, 'learning_rate': 6.773139745916516e-06, 'epoch': 15.44} +{'loss': 38.3713, 'grad_norm': 256.3241271972656, 'learning_rate': 6.767695099818511e-06, 'epoch': 15.44} +{'loss': 39.2487, 'grad_norm': 240.55934143066406, 'learning_rate': 6.7622504537205084e-06, 'epoch': 15.44} +{'loss': 39.4391, 'grad_norm': 230.010009765625, 'learning_rate': 6.756805807622505e-06, 'epoch': 15.45} +{'loss': 38.6273, 'grad_norm': 226.51385498046875, 'learning_rate': 6.751361161524502e-06, 'epoch': 15.45} + 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4290/5520 [3:46:03<1:00:00, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6027400493621826, 'eval_runtime': 3.1387, 'eval_samples_per_second': 57.029, 'eval_steps_per_second': 57.029, 'epoch': 15.45} +{'loss': 38.583, 'grad_norm': 314.57476806640625, 'learning_rate': 6.745916515426497e-06, 'epoch': 15.46} +{'loss': 39.2433, 'grad_norm': 229.91238403320312, 'learning_rate': 6.740471869328494e-06, 'epoch': 15.46} +{'loss': 38.8577, 'grad_norm': 284.7301330566406, 'learning_rate': 6.7350272232304904e-06, 'epoch': 15.46} +{'loss': 34.928, 'grad_norm': 209.32266235351562, 'learning_rate': 6.729582577132486e-06, 'epoch': 15.47} +{'loss': 32.0527, 'grad_norm': 264.6195068359375, 'learning_rate': 6.724137931034483e-06, 'epoch': 15.47} +{'loss': 31.939, 'grad_norm': 224.2421112060547, 'learning_rate': 6.718693284936479e-06, 'epoch': 15.47} +{'loss': 32.5402, 'grad_norm': 233.0791015625, 'learning_rate': 6.713248638838476e-06, 'epoch': 15.48} +{'loss': 31.0069, 'grad_norm': 284.129638671875, 'learning_rate': 6.707803992740472e-06, 'epoch': 15.48} +{'loss': 32.0172, 'grad_norm': 253.6517791748047, 'learning_rate': 6.702359346642469e-06, 'epoch': 15.48} +{'loss': 34.1643, 'grad_norm': 305.63775634765625, 'learning_rate': 6.696914700544465e-06, 'epoch': 15.49} + 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4300/5520 [3:46:35<59:35, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6044390201568604, 'eval_runtime': 3.1391, 'eval_samples_per_second': 57.023, 'eval_steps_per_second': 57.023, 'epoch': 15.49} +{'loss': 32.4735, 'grad_norm': 224.6516876220703, 'learning_rate': 6.691470054446461e-06, 'epoch': 15.49} +{'loss': 33.9272, 'grad_norm': 257.5385437011719, 'learning_rate': 6.686025408348457e-06, 'epoch': 15.49} +{'loss': 34.4176, 'grad_norm': 393.9106140136719, 'learning_rate': 6.680580762250454e-06, 'epoch': 15.5} +{'loss': 34.5695, 'grad_norm': 333.5639953613281, 'learning_rate': 6.675136116152451e-06, 'epoch': 15.5} +{'loss': 34.5337, 'grad_norm': 319.8660888671875, 'learning_rate': 6.669691470054446e-06, 'epoch': 15.51} +{'loss': 34.8297, 'grad_norm': 246.78086853027344, 'learning_rate': 6.664246823956443e-06, 'epoch': 15.51} +{'loss': 34.6901, 'grad_norm': 313.4530944824219, 'learning_rate': 6.658802177858439e-06, 'epoch': 15.51} +{'loss': 35.3892, 'grad_norm': 257.2852783203125, 'learning_rate': 6.6533575317604364e-06, 'epoch': 15.52} +{'loss': 36.3347, 'grad_norm': 336.5549011230469, 'learning_rate': 6.647912885662432e-06, 'epoch': 15.52} +{'loss': 36.3559, 'grad_norm': 275.726806640625, 'learning_rate': 6.642468239564428e-06, 'epoch': 15.52} + 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4310/5520 [3:47:07<59:10, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6056334376335144, 'eval_runtime': 3.1388, 'eval_samples_per_second': 57.028, 'eval_steps_per_second': 57.028, 'epoch': 15.52} +{'loss': 28.5887, 'grad_norm': 275.5987243652344, 'learning_rate': 6.637023593466425e-06, 'epoch': 15.53} +{'loss': 22.1398, 'grad_norm': 242.59762573242188, 'learning_rate': 6.631578947368421e-06, 'epoch': 15.53} +{'loss': 21.4593, 'grad_norm': 228.04344177246094, 'learning_rate': 6.626134301270418e-06, 'epoch': 15.53} +{'loss': 22.5132, 'grad_norm': 204.2377166748047, 'learning_rate': 6.620689655172414e-06, 'epoch': 15.54} +{'loss': 24.2777, 'grad_norm': 243.0237579345703, 'learning_rate': 6.615245009074411e-06, 'epoch': 15.54} +{'loss': 39.7235, 'grad_norm': 227.2841339111328, 'learning_rate': 6.609800362976407e-06, 'epoch': 15.55} +{'loss': 39.9317, 'grad_norm': 253.8453826904297, 'learning_rate': 6.6043557168784025e-06, 'epoch': 15.55} +{'loss': 38.9825, 'grad_norm': 243.62757873535156, 'learning_rate': 6.5989110707804e-06, 'epoch': 15.55} +{'loss': 39.7456, 'grad_norm': 262.4398498535156, 'learning_rate': 6.593466424682396e-06, 'epoch': 15.56} +{'loss': 39.5152, 'grad_norm': 268.5821228027344, 'learning_rate': 6.588021778584392e-06, 'epoch': 15.56} + 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4320/5520 [3:47:39<58:45, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6060237288475037, 'eval_runtime': 3.1376, 'eval_samples_per_second': 57.049, 'eval_steps_per_second': 57.049, 'epoch': 15.56} +{'loss': 40.1259, 'grad_norm': 297.6933898925781, 'learning_rate': 6.582577132486388e-06, 'epoch': 15.56} +{'loss': 40.8591, 'grad_norm': 234.08816528320312, 'learning_rate': 6.577132486388385e-06, 'epoch': 15.57} +{'loss': 39.2377, 'grad_norm': 292.2416687011719, 'learning_rate': 6.571687840290382e-06, 'epoch': 15.57} +{'loss': 39.92, 'grad_norm': 205.25888061523438, 'learning_rate': 6.566243194192377e-06, 'epoch': 15.57} +{'loss': 39.8886, 'grad_norm': 229.06695556640625, 'learning_rate': 6.560798548094374e-06, 'epoch': 15.58} +{'loss': 38.5423, 'grad_norm': 223.3977508544922, 'learning_rate': 6.55535390199637e-06, 'epoch': 15.58} +{'loss': 36.8055, 'grad_norm': 254.60203552246094, 'learning_rate': 6.549909255898367e-06, 'epoch': 15.59} +{'loss': 37.6164, 'grad_norm': 304.463623046875, 'learning_rate': 6.544464609800363e-06, 'epoch': 15.59} +{'loss': 37.4778, 'grad_norm': 279.955810546875, 'learning_rate': 6.53901996370236e-06, 'epoch': 15.59} +{'loss': 36.9663, 'grad_norm': 230.11105346679688, 'learning_rate': 6.533575317604356e-06, 'epoch': 15.6} + 78%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4330/5520 [3:48:11<58:10, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6048213243484497, 'eval_runtime': 3.1422, 'eval_samples_per_second': 56.966, 'eval_steps_per_second': 56.966, 'epoch': 15.6} +{'loss': 37.7402, 'grad_norm': 261.98187255859375, 'learning_rate': 6.528130671506351e-06, 'epoch': 15.6} +{'loss': 37.1402, 'grad_norm': 247.34771728515625, 'learning_rate': 6.5226860254083485e-06, 'epoch': 15.6} +{'loss': 38.3976, 'grad_norm': 277.1517333984375, 'learning_rate': 6.517241379310345e-06, 'epoch': 15.61} +{'loss': 38.0834, 'grad_norm': 231.89683532714844, 'learning_rate': 6.511796733212342e-06, 'epoch': 15.61} +{'loss': 37.9085, 'grad_norm': 323.8349304199219, 'learning_rate': 6.506352087114337e-06, 'epoch': 15.61} +{'loss': 37.0702, 'grad_norm': 263.5240783691406, 'learning_rate': 6.500907441016334e-06, 'epoch': 15.62} +{'loss': 36.9406, 'grad_norm': 217.0517578125, 'learning_rate': 6.4954627949183305e-06, 'epoch': 15.62} +{'loss': 38.8773, 'grad_norm': 267.4161682128906, 'learning_rate': 6.4900181488203276e-06, 'epoch': 15.62} +{'loss': 38.4978, 'grad_norm': 232.36000061035156, 'learning_rate': 6.484573502722323e-06, 'epoch': 15.63} +{'loss': 38.4895, 'grad_norm': 241.61373901367188, 'learning_rate': 6.479128856624319e-06, 'epoch': 15.63} + 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4340/5520 [3:48:43<57:36, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6024956703186035, 'eval_runtime': 3.1364, 'eval_samples_per_second': 57.072, 'eval_steps_per_second': 57.072, 'epoch': 15.63} +{'loss': 38.8551, 'grad_norm': 232.27928161621094, 'learning_rate': 6.473684210526316e-06, 'epoch': 15.64} +{'loss': 38.6475, 'grad_norm': 243.42828369140625, 'learning_rate': 6.4682395644283125e-06, 'epoch': 15.64} +{'loss': 37.2015, 'grad_norm': 306.2618103027344, 'learning_rate': 6.462794918330309e-06, 'epoch': 15.64} +{'loss': 36.5255, 'grad_norm': 335.795166015625, 'learning_rate': 6.457350272232305e-06, 'epoch': 15.65} +{'loss': 32.4219, 'grad_norm': 209.6246337890625, 'learning_rate': 6.451905626134302e-06, 'epoch': 15.65} +{'loss': 30.9137, 'grad_norm': 283.2094421386719, 'learning_rate': 6.446460980036297e-06, 'epoch': 15.65} +{'loss': 30.8939, 'grad_norm': 255.4412841796875, 'learning_rate': 6.441016333938294e-06, 'epoch': 15.66} +{'loss': 31.5974, 'grad_norm': 217.8052215576172, 'learning_rate': 6.435571687840291e-06, 'epoch': 15.66} +{'loss': 30.0276, 'grad_norm': 215.64398193359375, 'learning_rate': 6.430127041742287e-06, 'epoch': 15.66} +{'loss': 32.5249, 'grad_norm': 244.32704162597656, 'learning_rate': 6.424682395644283e-06, 'epoch': 15.67} + 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4350/5520 [3:49:15<57:21, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6037233471870422, 'eval_runtime': 3.1385, 'eval_samples_per_second': 57.033, 'eval_steps_per_second': 57.033, 'epoch': 15.67} +{'loss': 32.9923, 'grad_norm': 270.9132080078125, 'learning_rate': 6.419237749546279e-06, 'epoch': 15.67} +{'loss': 32.871, 'grad_norm': 230.20314025878906, 'learning_rate': 6.4137931034482765e-06, 'epoch': 15.68} +{'loss': 35.2687, 'grad_norm': 372.4366149902344, 'learning_rate': 6.408348457350273e-06, 'epoch': 15.68} +{'loss': 34.3107, 'grad_norm': 325.0901794433594, 'learning_rate': 6.402903811252268e-06, 'epoch': 15.68} +{'loss': 34.291, 'grad_norm': 277.8683166503906, 'learning_rate': 6.397459165154265e-06, 'epoch': 15.69} +{'loss': 33.2989, 'grad_norm': 262.566162109375, 'learning_rate': 6.392014519056261e-06, 'epoch': 15.69} +{'loss': 35.6865, 'grad_norm': 293.56536865234375, 'learning_rate': 6.386569872958258e-06, 'epoch': 15.69} +{'loss': 35.6959, 'grad_norm': 291.1886291503906, 'learning_rate': 6.381125226860254e-06, 'epoch': 15.7} +{'loss': 36.479, 'grad_norm': 265.2365417480469, 'learning_rate': 6.375680580762251e-06, 'epoch': 15.7} +{'loss': 35.9198, 'grad_norm': 342.8822021484375, 'learning_rate': 6.370235934664247e-06, 'epoch': 15.7} + 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4360/5520 [3:49:48<56:55, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.603361189365387, 'eval_runtime': 3.1415, 'eval_samples_per_second': 56.98, 'eval_steps_per_second': 56.98, 'epoch': 15.7} +{'loss': 29.429, 'grad_norm': 276.1657409667969, 'learning_rate': 6.364791288566243e-06, 'epoch': 15.71} +{'loss': 23.0038, 'grad_norm': 267.2456359863281, 'learning_rate': 6.35934664246824e-06, 'epoch': 15.71} +{'loss': 21.1185, 'grad_norm': 255.4893798828125, 'learning_rate': 6.353901996370236e-06, 'epoch': 15.72} +{'loss': 23.1769, 'grad_norm': 252.10501098632812, 'learning_rate': 6.348457350272233e-06, 'epoch': 15.72} +{'loss': 24.5905, 'grad_norm': 239.63905334472656, 'learning_rate': 6.343012704174228e-06, 'epoch': 15.72} +{'loss': 39.6657, 'grad_norm': 228.00950622558594, 'learning_rate': 6.337568058076225e-06, 'epoch': 15.73} +{'loss': 41.145, 'grad_norm': 234.10647583007812, 'learning_rate': 6.332123411978222e-06, 'epoch': 15.73} +{'loss': 40.2784, 'grad_norm': 236.55223083496094, 'learning_rate': 6.326678765880219e-06, 'epoch': 15.73} +{'loss': 39.3598, 'grad_norm': 340.1712646484375, 'learning_rate': 6.321234119782214e-06, 'epoch': 15.74} +{'loss': 38.7777, 'grad_norm': 269.4134826660156, 'learning_rate': 6.31578947368421e-06, 'epoch': 15.74} + 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4370/5520 [3:50:20<56:14, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6048015356063843, 'eval_runtime': 3.144, 'eval_samples_per_second': 56.935, 'eval_steps_per_second': 56.935, 'epoch': 15.74} +{'loss': 39.6707, 'grad_norm': 316.5471496582031, 'learning_rate': 6.310344827586207e-06, 'epoch': 15.74} +{'loss': 38.0009, 'grad_norm': 231.31820678710938, 'learning_rate': 6.304900181488203e-06, 'epoch': 15.75} +{'loss': 41.6523, 'grad_norm': 207.19117736816406, 'learning_rate': 6.2994555353902e-06, 'epoch': 15.75} +{'loss': 40.3203, 'grad_norm': 239.8341064453125, 'learning_rate': 6.294010889292196e-06, 'epoch': 15.75} +{'loss': 39.8026, 'grad_norm': 277.2004089355469, 'learning_rate': 6.288566243194193e-06, 'epoch': 15.76} +{'loss': 38.1561, 'grad_norm': 227.74728393554688, 'learning_rate': 6.2831215970961886e-06, 'epoch': 15.76} +{'loss': 37.4653, 'grad_norm': 268.6826477050781, 'learning_rate': 6.277676950998185e-06, 'epoch': 15.77} +{'loss': 36.3506, 'grad_norm': 308.92950439453125, 'learning_rate': 6.272232304900182e-06, 'epoch': 15.77} +{'loss': 36.12, 'grad_norm': 216.53627014160156, 'learning_rate': 6.266787658802178e-06, 'epoch': 15.77} +{'loss': 37.5023, 'grad_norm': 264.0691833496094, 'learning_rate': 6.261343012704174e-06, 'epoch': 15.78} + 79%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4380/5520 [3:50:52<55:44, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.608928382396698, 'eval_runtime': 3.141, 'eval_samples_per_second': 56.989, 'eval_steps_per_second': 56.989, 'epoch': 15.78} +{'loss': 38.8381, 'grad_norm': 474.7265319824219, 'learning_rate': 6.2558983666061706e-06, 'epoch': 15.78} +{'loss': 36.5951, 'grad_norm': 303.66229248046875, 'learning_rate': 6.250453720508168e-06, 'epoch': 15.78} +{'loss': 36.4717, 'grad_norm': 231.65744018554688, 'learning_rate': 6.245009074410164e-06, 'epoch': 15.79} +{'loss': 38.4578, 'grad_norm': 235.25833129882812, 'learning_rate': 6.239564428312159e-06, 'epoch': 15.79} +{'loss': 38.0475, 'grad_norm': 215.5384063720703, 'learning_rate': 6.234119782214156e-06, 'epoch': 15.79} +{'loss': 37.1825, 'grad_norm': 216.3609619140625, 'learning_rate': 6.2286751361161526e-06, 'epoch': 15.8} +{'loss': 38.5608, 'grad_norm': 275.54522705078125, 'learning_rate': 6.223230490018149e-06, 'epoch': 15.8} +{'loss': 38.0612, 'grad_norm': 226.7752685546875, 'learning_rate': 6.217785843920145e-06, 'epoch': 15.81} +{'loss': 38.0049, 'grad_norm': 262.14501953125, 'learning_rate': 6.212341197822142e-06, 'epoch': 15.81} +{'loss': 39.1441, 'grad_norm': 299.82196044921875, 'learning_rate': 6.206896551724138e-06, 'epoch': 15.81} + 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4390/5520 [3:51:24<55:08, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6033969521522522, 'eval_runtime': 3.14, 'eval_samples_per_second': 57.007, 'eval_steps_per_second': 57.007, 'epoch': 15.81} +{'loss': 39.266, 'grad_norm': 295.24188232421875, 'learning_rate': 6.2014519056261346e-06, 'epoch': 15.82} +{'loss': 39.4025, 'grad_norm': 298.1729736328125, 'learning_rate': 6.196007259528131e-06, 'epoch': 15.82} +{'loss': 39.4752, 'grad_norm': 234.97958374023438, 'learning_rate': 6.190562613430127e-06, 'epoch': 15.82} +{'loss': 36.0322, 'grad_norm': 270.3009338378906, 'learning_rate': 6.185117967332124e-06, 'epoch': 15.83} +{'loss': 33.3256, 'grad_norm': 279.78314208984375, 'learning_rate': 6.1796733212341195e-06, 'epoch': 15.83} +{'loss': 33.1552, 'grad_norm': 258.82598876953125, 'learning_rate': 6.1742286751361166e-06, 'epoch': 15.83} +{'loss': 32.0024, 'grad_norm': 280.8109130859375, 'learning_rate': 6.168784029038113e-06, 'epoch': 15.84} +{'loss': 32.4901, 'grad_norm': 265.08111572265625, 'learning_rate': 6.163339382940109e-06, 'epoch': 15.84} +{'loss': 33.1995, 'grad_norm': 316.56427001953125, 'learning_rate': 6.157894736842105e-06, 'epoch': 15.85} +{'loss': 33.1914, 'grad_norm': 256.03717041015625, 'learning_rate': 6.1524500907441015e-06, 'epoch': 15.85} + 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4400/5520 [3:51:56<54:52, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6017575263977051, 'eval_runtime': 3.1385, 'eval_samples_per_second': 57.034, 'eval_steps_per_second': 57.034, 'epoch': 15.85} +{'loss': 33.8459, 'grad_norm': 242.54119873046875, 'learning_rate': 6.1470054446460985e-06, 'epoch': 15.85} +{'loss': 34.1317, 'grad_norm': 259.1406555175781, 'learning_rate': 6.141560798548094e-06, 'epoch': 15.86} +{'loss': 34.2777, 'grad_norm': 272.77880859375, 'learning_rate': 6.136116152450091e-06, 'epoch': 15.86} +{'loss': 34.0165, 'grad_norm': 231.60845947265625, 'learning_rate': 6.130671506352087e-06, 'epoch': 15.86} +{'loss': 34.2761, 'grad_norm': 230.85675048828125, 'learning_rate': 6.125226860254084e-06, 'epoch': 15.87} +{'loss': 33.7407, 'grad_norm': 307.4486389160156, 'learning_rate': 6.11978221415608e-06, 'epoch': 15.87} +{'loss': 34.1672, 'grad_norm': 264.7835388183594, 'learning_rate': 6.114337568058076e-06, 'epoch': 15.87} +{'loss': 35.7158, 'grad_norm': 234.93968200683594, 'learning_rate': 6.108892921960073e-06, 'epoch': 15.88} +{'loss': 36.1292, 'grad_norm': 300.0079345703125, 'learning_rate': 6.103448275862069e-06, 'epoch': 15.88} +{'loss': 34.8222, 'grad_norm': 326.20416259765625, 'learning_rate': 6.0980036297640655e-06, 'epoch': 15.88} + 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4400/5520 [3:51:59<54:52, 2.94s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4410/5520 [3:52:29<54:28, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6024067401885986, 'eval_runtime': 3.1409, 'eval_samples_per_second': 56.99, 'eval_steps_per_second': 56.99, 'epoch': 15.88} +{'loss': 27.4819, 'grad_norm': 214.6174774169922, 'learning_rate': 6.092558983666062e-06, 'epoch': 15.89} +{'loss': 22.3862, 'grad_norm': 222.7063446044922, 'learning_rate': 6.087114337568059e-06, 'epoch': 15.89} +{'loss': 22.8483, 'grad_norm': 277.0006103515625, 'learning_rate': 6.081669691470054e-06, 'epoch': 15.9} +{'loss': 23.2021, 'grad_norm': 264.3949890136719, 'learning_rate': 6.076225045372051e-06, 'epoch': 15.9} +{'loss': 23.9378, 'grad_norm': 244.04611206054688, 'learning_rate': 6.0707803992740475e-06, 'epoch': 15.9} +{'loss': 39.4708, 'grad_norm': 219.24403381347656, 'learning_rate': 6.065335753176044e-06, 'epoch': 15.91} +{'loss': 39.9151, 'grad_norm': 297.3822937011719, 'learning_rate': 6.05989110707804e-06, 'epoch': 15.91} +{'loss': 39.0545, 'grad_norm': 282.748291015625, 'learning_rate': 6.054446460980036e-06, 'epoch': 15.91} +{'loss': 39.7046, 'grad_norm': 274.6419982910156, 'learning_rate': 6.049001814882033e-06, 'epoch': 15.92} +{'loss': 39.8849, 'grad_norm': 261.2831115722656, 'learning_rate': 6.0435571687840295e-06, 'epoch': 15.92} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4420/5520 [3:53:01<53:45, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6017056107521057, 'eval_runtime': 3.1351, 'eval_samples_per_second': 57.095, 'eval_steps_per_second': 57.095, 'epoch': 15.92} +{'loss': 39.8861, 'grad_norm': 276.61505126953125, 'learning_rate': 6.038112522686026e-06, 'epoch': 15.92} +{'loss': 36.2526, 'grad_norm': 273.4017333984375, 'learning_rate': 6.032667876588022e-06, 'epoch': 15.93} +{'loss': 37.1316, 'grad_norm': 314.4811706542969, 'learning_rate': 6.027223230490018e-06, 'epoch': 15.93} +{'loss': 38.1698, 'grad_norm': 265.7447204589844, 'learning_rate': 6.021778584392014e-06, 'epoch': 15.94} +{'loss': 38.9541, 'grad_norm': 448.373291015625, 'learning_rate': 6.016333938294011e-06, 'epoch': 15.94} +{'loss': 36.6694, 'grad_norm': 261.33966064453125, 'learning_rate': 6.010889292196008e-06, 'epoch': 15.94} +{'loss': 39.1773, 'grad_norm': 383.16363525390625, 'learning_rate': 6.005444646098004e-06, 'epoch': 15.95} +{'loss': 36.9482, 'grad_norm': 279.26446533203125, 'learning_rate': 6e-06, 'epoch': 15.95} +{'loss': 36.653, 'grad_norm': 307.5321960449219, 'learning_rate': 5.994555353901996e-06, 'epoch': 15.95} +{'loss': 36.3768, 'grad_norm': 412.80023193359375, 'learning_rate': 5.989110707803993e-06, 'epoch': 15.96} + 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4430/5520 [3:53:33<53:17, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6033455729484558, 'eval_runtime': 3.1365, 'eval_samples_per_second': 57.071, 'eval_steps_per_second': 57.071, 'epoch': 15.96} +{'loss': 32.546, 'grad_norm': 254.2952880859375, 'learning_rate': 5.98366606170599e-06, 'epoch': 15.96} +{'loss': 32.7021, 'grad_norm': 324.0749816894531, 'learning_rate': 5.978221415607985e-06, 'epoch': 15.96} +{'loss': 33.3823, 'grad_norm': 326.0075988769531, 'learning_rate': 5.972776769509982e-06, 'epoch': 15.97} +{'loss': 33.3397, 'grad_norm': 252.98471069335938, 'learning_rate': 5.967332123411978e-06, 'epoch': 15.97} +{'loss': 34.2781, 'grad_norm': 243.14117431640625, 'learning_rate': 5.9618874773139755e-06, 'epoch': 15.98} +{'loss': 34.1163, 'grad_norm': 304.3429260253906, 'learning_rate': 5.956442831215971e-06, 'epoch': 15.98} +{'loss': 34.1024, 'grad_norm': 320.1651916503906, 'learning_rate': 5.950998185117968e-06, 'epoch': 15.98} +{'loss': 35.8121, 'grad_norm': 252.0004425048828, 'learning_rate': 5.945553539019964e-06, 'epoch': 15.99} +{'loss': 35.6666, 'grad_norm': 342.5635986328125, 'learning_rate': 5.9401088929219595e-06, 'epoch': 15.99} +{'loss': 30.2617, 'grad_norm': 226.57249450683594, 'learning_rate': 5.934664246823957e-06, 'epoch': 15.99} + 80%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4440/5520 [3:54:05<52:45, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6029886603355408, 'eval_runtime': 3.1415, 'eval_samples_per_second': 56.979, 'eval_steps_per_second': 56.979, 'epoch': 15.99} +{'loss': 22.8166, 'grad_norm': 202.94903564453125, 'learning_rate': 5.929219600725953e-06, 'epoch': 16.0} +{'loss': 20.3903, 'grad_norm': 200.84317016601562, 'learning_rate': 5.92377495462795e-06, 'epoch': 16.0} +{'loss': 39.0985, 'grad_norm': 230.5917510986328, 'learning_rate': 5.918330308529945e-06, 'epoch': 16.0} +{'loss': 39.2128, 'grad_norm': 285.6978759765625, 'learning_rate': 5.912885662431942e-06, 'epoch': 16.01} +{'loss': 38.9026, 'grad_norm': 221.70896911621094, 'learning_rate': 5.907441016333939e-06, 'epoch': 16.01} +{'loss': 38.7336, 'grad_norm': 318.14068603515625, 'learning_rate': 5.901996370235935e-06, 'epoch': 16.01} +{'loss': 38.7117, 'grad_norm': 324.451904296875, 'learning_rate': 5.896551724137931e-06, 'epoch': 16.02} +{'loss': 39.6053, 'grad_norm': 295.038818359375, 'learning_rate': 5.891107078039927e-06, 'epoch': 16.02} +{'loss': 38.931, 'grad_norm': 267.0055236816406, 'learning_rate': 5.885662431941924e-06, 'epoch': 16.03} +{'loss': 41.1717, 'grad_norm': 269.20074462890625, 'learning_rate': 5.88021778584392e-06, 'epoch': 16.03} + 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4450/5520 [3:54:37<52:03, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6036069393157959, 'eval_runtime': 3.1459, 'eval_samples_per_second': 56.899, 'eval_steps_per_second': 56.899, 'epoch': 16.03} +{'loss': 38.7027, 'grad_norm': 241.9443359375, 'learning_rate': 5.874773139745917e-06, 'epoch': 16.03} +{'loss': 39.1284, 'grad_norm': 238.54847717285156, 'learning_rate': 5.869328493647913e-06, 'epoch': 16.04} +{'loss': 38.0767, 'grad_norm': 339.3023681640625, 'learning_rate': 5.863883847549909e-06, 'epoch': 16.04} +{'loss': 34.8207, 'grad_norm': 257.29522705078125, 'learning_rate': 5.8584392014519055e-06, 'epoch': 16.04} +{'loss': 35.5021, 'grad_norm': 264.24200439453125, 'learning_rate': 5.852994555353902e-06, 'epoch': 16.05} +{'loss': 35.7826, 'grad_norm': 251.3128662109375, 'learning_rate': 5.847549909255899e-06, 'epoch': 16.05} +{'loss': 36.7373, 'grad_norm': 310.6581726074219, 'learning_rate': 5.842105263157895e-06, 'epoch': 16.05} +{'loss': 36.4048, 'grad_norm': 299.07550048828125, 'learning_rate': 5.836660617059891e-06, 'epoch': 16.06} +{'loss': 36.3982, 'grad_norm': 257.58740234375, 'learning_rate': 5.8312159709618875e-06, 'epoch': 16.06} +{'loss': 36.8518, 'grad_norm': 337.6795654296875, 'learning_rate': 5.825771324863884e-06, 'epoch': 16.07} + 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4460/5520 [3:55:09<51:25, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6036850214004517, 'eval_runtime': 3.1399, 'eval_samples_per_second': 57.009, 'eval_steps_per_second': 57.009, 'epoch': 16.07} +{'loss': 36.1763, 'grad_norm': 275.02423095703125, 'learning_rate': 5.820326678765881e-06, 'epoch': 16.07} +{'loss': 37.6417, 'grad_norm': 263.4334716796875, 'learning_rate': 5.814882032667876e-06, 'epoch': 16.07} +{'loss': 35.6537, 'grad_norm': 213.16749572753906, 'learning_rate': 5.809437386569873e-06, 'epoch': 16.08} +{'loss': 36.5693, 'grad_norm': 263.4288330078125, 'learning_rate': 5.8039927404718695e-06, 'epoch': 16.08} +{'loss': 37.3424, 'grad_norm': 284.67254638671875, 'learning_rate': 5.798548094373866e-06, 'epoch': 16.08} +{'loss': 38.7851, 'grad_norm': 355.7987060546875, 'learning_rate': 5.793103448275862e-06, 'epoch': 16.09} +{'loss': 38.1334, 'grad_norm': 249.7351531982422, 'learning_rate': 5.787658802177859e-06, 'epoch': 16.09} +{'loss': 37.8369, 'grad_norm': 257.4977722167969, 'learning_rate': 5.782214156079855e-06, 'epoch': 16.09} +{'loss': 37.4005, 'grad_norm': 242.59584045410156, 'learning_rate': 5.776769509981851e-06, 'epoch': 16.1} +{'loss': 38.2287, 'grad_norm': 270.0740966796875, 'learning_rate': 5.771324863883848e-06, 'epoch': 16.1} + 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4470/5520 [3:55:41<51:18, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6018803119659424, 'eval_runtime': 3.1381, 'eval_samples_per_second': 57.04, 'eval_steps_per_second': 57.04, 'epoch': 16.1} +{'loss': 35.7162, 'grad_norm': 225.32322692871094, 'learning_rate': 5.765880217785844e-06, 'epoch': 16.1} +{'loss': 32.8733, 'grad_norm': 275.3272705078125, 'learning_rate': 5.760435571687841e-06, 'epoch': 16.11} +{'loss': 33.2271, 'grad_norm': 259.5124206542969, 'learning_rate': 5.7549909255898364e-06, 'epoch': 16.11} +{'loss': 30.2931, 'grad_norm': 249.75738525390625, 'learning_rate': 5.7495462794918335e-06, 'epoch': 16.12} +{'loss': 30.9294, 'grad_norm': 277.7652282714844, 'learning_rate': 5.74410163339383e-06, 'epoch': 16.12} +{'loss': 31.7337, 'grad_norm': 223.28250122070312, 'learning_rate': 5.738656987295825e-06, 'epoch': 16.12} +{'loss': 31.2897, 'grad_norm': 259.5106201171875, 'learning_rate': 5.733212341197822e-06, 'epoch': 16.13} +{'loss': 32.8436, 'grad_norm': 241.0313720703125, 'learning_rate': 5.7277676950998184e-06, 'epoch': 16.13} +{'loss': 33.6823, 'grad_norm': 277.46905517578125, 'learning_rate': 5.7223230490018155e-06, 'epoch': 16.13} +{'loss': 33.1107, 'grad_norm': 264.2905578613281, 'learning_rate': 5.716878402903811e-06, 'epoch': 16.14} + 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4480/5520 [3:56:13<50:42, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6046355962753296, 'eval_runtime': 3.1395, 'eval_samples_per_second': 57.015, 'eval_steps_per_second': 57.015, 'epoch': 16.14} +{'loss': 33.6291, 'grad_norm': 295.5188903808594, 'learning_rate': 5.711433756805808e-06, 'epoch': 16.14} +{'loss': 33.0773, 'grad_norm': 282.6014709472656, 'learning_rate': 5.705989110707804e-06, 'epoch': 16.14} +{'loss': 35.0269, 'grad_norm': 270.7958679199219, 'learning_rate': 5.7005444646098004e-06, 'epoch': 16.15} +{'loss': 35.1349, 'grad_norm': 344.7304992675781, 'learning_rate': 5.695099818511797e-06, 'epoch': 16.15} +{'loss': 36.3309, 'grad_norm': 294.5618896484375, 'learning_rate': 5.689655172413793e-06, 'epoch': 16.16} +{'loss': 35.0976, 'grad_norm': 305.5354309082031, 'learning_rate': 5.68421052631579e-06, 'epoch': 16.16} +{'loss': 34.9113, 'grad_norm': 293.9934387207031, 'learning_rate': 5.678765880217786e-06, 'epoch': 16.16} +{'loss': 24.8815, 'grad_norm': 277.9523010253906, 'learning_rate': 5.6733212341197824e-06, 'epoch': 16.17} +{'loss': 22.4544, 'grad_norm': 297.0547790527344, 'learning_rate': 5.667876588021779e-06, 'epoch': 16.17} +{'loss': 21.8323, 'grad_norm': 237.44741821289062, 'learning_rate': 5.662431941923776e-06, 'epoch': 16.17} + 81%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4490/5520 [3:56:45<50:20, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6061411499977112, 'eval_runtime': 3.1415, 'eval_samples_per_second': 56.98, 'eval_steps_per_second': 56.98, 'epoch': 16.17} +{'loss': 22.7531, 'grad_norm': 220.5832977294922, 'learning_rate': 5.656987295825771e-06, 'epoch': 16.18} +{'loss': 23.7107, 'grad_norm': 298.8033142089844, 'learning_rate': 5.651542649727767e-06, 'epoch': 16.18} +{'loss': 39.1679, 'grad_norm': 250.02593994140625, 'learning_rate': 5.6460980036297644e-06, 'epoch': 16.18} +{'loss': 40.6492, 'grad_norm': 253.00746154785156, 'learning_rate': 5.640653357531761e-06, 'epoch': 16.19} +{'loss': 38.604, 'grad_norm': 215.04270935058594, 'learning_rate': 5.635208711433757e-06, 'epoch': 16.19} +{'loss': 39.1417, 'grad_norm': 395.6152648925781, 'learning_rate': 5.629764065335753e-06, 'epoch': 16.2} +{'loss': 39.4322, 'grad_norm': 380.3653869628906, 'learning_rate': 5.62431941923775e-06, 'epoch': 16.2} +{'loss': 39.1721, 'grad_norm': 309.3524475097656, 'learning_rate': 5.6188747731397464e-06, 'epoch': 16.2} +{'loss': 39.1462, 'grad_norm': 237.88262939453125, 'learning_rate': 5.613430127041742e-06, 'epoch': 16.21} +{'loss': 39.8177, 'grad_norm': 233.66690063476562, 'learning_rate': 5.607985480943739e-06, 'epoch': 16.21} + 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4500/5520 [3:57:17<49:48, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6043822169303894, 'eval_runtime': 3.1418, 'eval_samples_per_second': 56.974, 'eval_steps_per_second': 56.974, 'epoch': 16.21} +{'loss': 39.7878, 'grad_norm': 229.3720703125, 'learning_rate': 5.602540834845735e-06, 'epoch': 16.21} +{'loss': 40.0754, 'grad_norm': 228.66493225097656, 'learning_rate': 5.597096188747731e-06, 'epoch': 16.22} +{'loss': 38.7709, 'grad_norm': 276.40240478515625, 'learning_rate': 5.591651542649728e-06, 'epoch': 16.22} +{'loss': 37.7439, 'grad_norm': 268.62371826171875, 'learning_rate': 5.586206896551725e-06, 'epoch': 16.22} +{'loss': 38.2511, 'grad_norm': 271.0934753417969, 'learning_rate': 5.580762250453721e-06, 'epoch': 16.23} +{'loss': 36.716, 'grad_norm': 253.63385009765625, 'learning_rate': 5.575317604355716e-06, 'epoch': 16.23} +{'loss': 36.5517, 'grad_norm': 265.1177978515625, 'learning_rate': 5.569872958257713e-06, 'epoch': 16.23} +{'loss': 37.1524, 'grad_norm': 332.52972412109375, 'learning_rate': 5.56442831215971e-06, 'epoch': 16.24} +{'loss': 36.6666, 'grad_norm': 247.53643798828125, 'learning_rate': 5.558983666061707e-06, 'epoch': 16.24} +{'loss': 37.0842, 'grad_norm': 233.3318634033203, 'learning_rate': 5.553539019963702e-06, 'epoch': 16.25} + 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4510/5520 [3:57:49<49:08, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6042913794517517, 'eval_runtime': 3.14, 'eval_samples_per_second': 57.007, 'eval_steps_per_second': 57.007, 'epoch': 16.25} +{'loss': 37.6382, 'grad_norm': 222.98350524902344, 'learning_rate': 5.548094373865699e-06, 'epoch': 16.25} +{'loss': 38.0509, 'grad_norm': 234.33267211914062, 'learning_rate': 5.542649727767695e-06, 'epoch': 16.25} +{'loss': 36.509, 'grad_norm': 303.56005859375, 'learning_rate': 5.5372050816696924e-06, 'epoch': 16.26} +{'loss': 36.3975, 'grad_norm': 232.0821075439453, 'learning_rate': 5.531760435571688e-06, 'epoch': 16.26} +{'loss': 37.0448, 'grad_norm': 223.3292236328125, 'learning_rate': 5.526315789473684e-06, 'epoch': 16.26} +{'loss': 37.8635, 'grad_norm': 241.2131805419922, 'learning_rate': 5.520871143375681e-06, 'epoch': 16.27} +{'loss': 38.2789, 'grad_norm': 288.62689208984375, 'learning_rate': 5.5154264972776765e-06, 'epoch': 16.27} +{'loss': 37.9052, 'grad_norm': 262.59637451171875, 'learning_rate': 5.5099818511796736e-06, 'epoch': 16.27} +{'loss': 38.0485, 'grad_norm': 258.0476379394531, 'learning_rate': 5.50453720508167e-06, 'epoch': 16.28} +{'loss': 37.6134, 'grad_norm': 295.2730407714844, 'learning_rate': 5.499092558983667e-06, 'epoch': 16.28} + 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4520/5520 [3:58:21<48:58, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.601740300655365, 'eval_runtime': 3.1387, 'eval_samples_per_second': 57.03, 'eval_steps_per_second': 57.03, 'epoch': 16.28} +{'loss': 36.1289, 'grad_norm': 246.38548278808594, 'learning_rate': 5.493647912885662e-06, 'epoch': 16.29} +{'loss': 31.8834, 'grad_norm': 271.28997802734375, 'learning_rate': 5.4882032667876585e-06, 'epoch': 16.29} +{'loss': 31.4899, 'grad_norm': 231.76246643066406, 'learning_rate': 5.4827586206896556e-06, 'epoch': 16.29} +{'loss': 31.7102, 'grad_norm': 238.7414093017578, 'learning_rate': 5.477313974591652e-06, 'epoch': 16.3} +{'loss': 31.3557, 'grad_norm': 302.0710144042969, 'learning_rate': 5.471869328493648e-06, 'epoch': 16.3} +{'loss': 33.0781, 'grad_norm': 282.72015380859375, 'learning_rate': 5.466424682395644e-06, 'epoch': 16.3} +{'loss': 33.2963, 'grad_norm': 224.8140869140625, 'learning_rate': 5.460980036297641e-06, 'epoch': 16.31} +{'loss': 34.4455, 'grad_norm': 239.20570373535156, 'learning_rate': 5.4555353901996376e-06, 'epoch': 16.31} +{'loss': 34.534, 'grad_norm': 304.7758483886719, 'learning_rate': 5.450090744101633e-06, 'epoch': 16.31} +{'loss': 33.5232, 'grad_norm': 274.8758239746094, 'learning_rate': 5.44464609800363e-06, 'epoch': 16.32} + 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4530/5520 [3:58:53<48:15, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6031973958015442, 'eval_runtime': 3.1365, 'eval_samples_per_second': 57.071, 'eval_steps_per_second': 57.071, 'epoch': 16.32} +{'loss': 33.403, 'grad_norm': 295.1776428222656, 'learning_rate': 5.439201451905626e-06, 'epoch': 16.32} +{'loss': 34.1785, 'grad_norm': 309.03399658203125, 'learning_rate': 5.4337568058076225e-06, 'epoch': 16.33} +{'loss': 34.4855, 'grad_norm': 285.26385498046875, 'learning_rate': 5.428312159709619e-06, 'epoch': 16.33} +{'loss': 32.4791, 'grad_norm': 307.0184020996094, 'learning_rate': 5.422867513611616e-06, 'epoch': 16.33} +{'loss': 35.697, 'grad_norm': 318.8267822265625, 'learning_rate': 5.417422867513612e-06, 'epoch': 16.34} +{'loss': 36.1811, 'grad_norm': 356.0179138183594, 'learning_rate': 5.411978221415607e-06, 'epoch': 16.34} +{'loss': 36.2251, 'grad_norm': 332.1255187988281, 'learning_rate': 5.4065335753176045e-06, 'epoch': 16.34} +{'loss': 32.0518, 'grad_norm': 288.78118896484375, 'learning_rate': 5.401088929219601e-06, 'epoch': 16.35} +{'loss': 23.627, 'grad_norm': 250.37245178222656, 'learning_rate': 5.395644283121598e-06, 'epoch': 16.35} +{'loss': 21.7919, 'grad_norm': 199.92352294921875, 'learning_rate': 5.390199637023593e-06, 'epoch': 16.35} + 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4540/5520 [3:59:25<47:55, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6021688580513, 'eval_runtime': 3.1361, 'eval_samples_per_second': 57.078, 'eval_steps_per_second': 57.078, 'epoch': 16.35} +{'loss': 23.0672, 'grad_norm': 265.47015380859375, 'learning_rate': 5.38475499092559e-06, 'epoch': 16.36} +{'loss': 22.7983, 'grad_norm': 281.188720703125, 'learning_rate': 5.3793103448275865e-06, 'epoch': 16.36} +{'loss': 38.1042, 'grad_norm': 195.5351104736328, 'learning_rate': 5.373865698729583e-06, 'epoch': 16.36} +{'loss': 39.8602, 'grad_norm': 234.76573181152344, 'learning_rate': 5.368421052631579e-06, 'epoch': 16.37} +{'loss': 40.2156, 'grad_norm': 237.9152374267578, 'learning_rate': 5.362976406533575e-06, 'epoch': 16.37} +{'loss': 39.3676, 'grad_norm': 297.722900390625, 'learning_rate': 5.357531760435572e-06, 'epoch': 16.38} +{'loss': 38.7905, 'grad_norm': 218.61727905273438, 'learning_rate': 5.352087114337568e-06, 'epoch': 16.38} +{'loss': 39.3998, 'grad_norm': 245.19561767578125, 'learning_rate': 5.346642468239565e-06, 'epoch': 16.38} +{'loss': 40.0835, 'grad_norm': 247.5048370361328, 'learning_rate': 5.341197822141561e-06, 'epoch': 16.39} +{'loss': 39.1135, 'grad_norm': 214.40684509277344, 'learning_rate': 5.335753176043558e-06, 'epoch': 16.39} + 82%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4550/5520 [3:59:58<47:26, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6014460325241089, 'eval_runtime': 3.1433, 'eval_samples_per_second': 56.946, 'eval_steps_per_second': 56.946, 'epoch': 16.39} +{'loss': 38.9449, 'grad_norm': 216.72271728515625, 'learning_rate': 5.330308529945553e-06, 'epoch': 16.39} +{'loss': 39.2646, 'grad_norm': 224.22262573242188, 'learning_rate': 5.32486388384755e-06, 'epoch': 16.4} +{'loss': 38.0846, 'grad_norm': 258.6524353027344, 'learning_rate': 5.319419237749547e-06, 'epoch': 16.4} +{'loss': 37.4963, 'grad_norm': 241.7313232421875, 'learning_rate': 5.313974591651543e-06, 'epoch': 16.4} +{'loss': 36.4783, 'grad_norm': 241.3990478515625, 'learning_rate': 5.308529945553539e-06, 'epoch': 16.41} +{'loss': 36.1592, 'grad_norm': 207.1470947265625, 'learning_rate': 5.303085299455535e-06, 'epoch': 16.41} +{'loss': 35.7946, 'grad_norm': 224.51690673828125, 'learning_rate': 5.2976406533575325e-06, 'epoch': 16.42} +{'loss': 36.8986, 'grad_norm': 292.4340515136719, 'learning_rate': 5.292196007259528e-06, 'epoch': 16.42} +{'loss': 37.1165, 'grad_norm': 244.67117309570312, 'learning_rate': 5.286751361161524e-06, 'epoch': 16.42} +{'loss': 36.4423, 'grad_norm': 331.14654541015625, 'learning_rate': 5.281306715063521e-06, 'epoch': 16.43} + 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4560/5520 [4:00:30<46:46, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6067427396774292, 'eval_runtime': 3.1434, 'eval_samples_per_second': 56.946, 'eval_steps_per_second': 56.946, 'epoch': 16.43} +{'loss': 39.0014, 'grad_norm': 262.373046875, 'learning_rate': 5.275862068965517e-06, 'epoch': 16.43} +{'loss': 38.0152, 'grad_norm': 237.48350524902344, 'learning_rate': 5.270417422867514e-06, 'epoch': 16.43} +{'loss': 37.6952, 'grad_norm': 273.0652770996094, 'learning_rate': 5.26497277676951e-06, 'epoch': 16.44} +{'loss': 38.4266, 'grad_norm': 239.0780029296875, 'learning_rate': 5.259528130671507e-06, 'epoch': 16.44} +{'loss': 36.5596, 'grad_norm': 277.978759765625, 'learning_rate': 5.254083484573503e-06, 'epoch': 16.44} +{'loss': 39.1408, 'grad_norm': 216.2267303466797, 'learning_rate': 5.248638838475499e-06, 'epoch': 16.45} +{'loss': 38.7286, 'grad_norm': 231.80581665039062, 'learning_rate': 5.243194192377496e-06, 'epoch': 16.45} +{'loss': 39.2426, 'grad_norm': 236.4004669189453, 'learning_rate': 5.237749546279492e-06, 'epoch': 16.46} +{'loss': 38.6546, 'grad_norm': 270.0268859863281, 'learning_rate': 5.232304900181488e-06, 'epoch': 16.46} +{'loss': 37.554, 'grad_norm': 255.8044891357422, 'learning_rate': 5.226860254083484e-06, 'epoch': 16.46} + 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4570/5520 [4:01:02<46:33, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6019929647445679, 'eval_runtime': 3.137, 'eval_samples_per_second': 57.062, 'eval_steps_per_second': 57.062, 'epoch': 16.46} +{'loss': 34.9309, 'grad_norm': 321.18499755859375, 'learning_rate': 5.221415607985481e-06, 'epoch': 16.47} +{'loss': 35.8779, 'grad_norm': 311.94305419921875, 'learning_rate': 5.215970961887478e-06, 'epoch': 16.47} +{'loss': 31.8385, 'grad_norm': 211.90234375, 'learning_rate': 5.210526315789474e-06, 'epoch': 16.47} +{'loss': 31.8078, 'grad_norm': 284.64581298828125, 'learning_rate': 5.20508166969147e-06, 'epoch': 16.48} +{'loss': 33.2542, 'grad_norm': 291.94891357421875, 'learning_rate': 5.199637023593466e-06, 'epoch': 16.48} +{'loss': 31.5292, 'grad_norm': 243.61956787109375, 'learning_rate': 5.194192377495463e-06, 'epoch': 16.48} +{'loss': 33.9643, 'grad_norm': 242.07696533203125, 'learning_rate': 5.188747731397459e-06, 'epoch': 16.49} +{'loss': 33.7718, 'grad_norm': 255.0625457763672, 'learning_rate': 5.183303085299456e-06, 'epoch': 16.49} +{'loss': 31.5248, 'grad_norm': 249.40240478515625, 'learning_rate': 5.177858439201452e-06, 'epoch': 16.49} +{'loss': 34.5657, 'grad_norm': 231.3375244140625, 'learning_rate': 5.172413793103449e-06, 'epoch': 16.5} + 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4580/5520 [4:01:34<46:04, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6017265319824219, 'eval_runtime': 3.1398, 'eval_samples_per_second': 57.01, 'eval_steps_per_second': 57.01, 'epoch': 16.5} +{'loss': 33.766, 'grad_norm': 247.97012329101562, 'learning_rate': 5.1669691470054445e-06, 'epoch': 16.5} +{'loss': 34.0841, 'grad_norm': 310.730224609375, 'learning_rate': 5.161524500907441e-06, 'epoch': 16.51} +{'loss': 35.0788, 'grad_norm': 323.5569152832031, 'learning_rate': 5.156079854809438e-06, 'epoch': 16.51} +{'loss': 33.5322, 'grad_norm': 247.95480346679688, 'learning_rate': 5.150635208711433e-06, 'epoch': 16.51} +{'loss': 34.4701, 'grad_norm': 307.6163024902344, 'learning_rate': 5.14519056261343e-06, 'epoch': 16.52} +{'loss': 35.8526, 'grad_norm': 239.569580078125, 'learning_rate': 5.1397459165154265e-06, 'epoch': 16.52} +{'loss': 36.2235, 'grad_norm': 362.4159240722656, 'learning_rate': 5.134301270417424e-06, 'epoch': 16.52} +{'loss': 33.4705, 'grad_norm': 321.2509765625, 'learning_rate': 5.128856624319419e-06, 'epoch': 16.53} +{'loss': 23.1329, 'grad_norm': 248.6092071533203, 'learning_rate': 5.123411978221415e-06, 'epoch': 16.53} +{'loss': 20.3184, 'grad_norm': 289.8996276855469, 'learning_rate': 5.117967332123412e-06, 'epoch': 16.53} + 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4590/5520 [4:02:06<45:29, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6034744381904602, 'eval_runtime': 3.1405, 'eval_samples_per_second': 56.997, 'eval_steps_per_second': 56.997, 'epoch': 16.53} +{'loss': 23.0513, 'grad_norm': 215.02142333984375, 'learning_rate': 5.1125226860254085e-06, 'epoch': 16.54} +{'loss': 24.462, 'grad_norm': 299.8429870605469, 'learning_rate': 5.107078039927405e-06, 'epoch': 16.54} +{'loss': 39.9148, 'grad_norm': 267.0840759277344, 'learning_rate': 5.101633393829401e-06, 'epoch': 16.55} +{'loss': 40.6498, 'grad_norm': 227.23731994628906, 'learning_rate': 5.096188747731398e-06, 'epoch': 16.55} +{'loss': 38.7711, 'grad_norm': 313.9705810546875, 'learning_rate': 5.0907441016333935e-06, 'epoch': 16.55} +{'loss': 39.6938, 'grad_norm': 398.0429382324219, 'learning_rate': 5.0852994555353905e-06, 'epoch': 16.56} +{'loss': 39.356, 'grad_norm': 365.489990234375, 'learning_rate': 5.079854809437387e-06, 'epoch': 16.56} +{'loss': 40.2504, 'grad_norm': 365.05267333984375, 'learning_rate': 5.074410163339383e-06, 'epoch': 16.56} +{'loss': 39.6045, 'grad_norm': 288.0643310546875, 'learning_rate': 5.068965517241379e-06, 'epoch': 16.57} +{'loss': 40.2504, 'grad_norm': 262.0147705078125, 'learning_rate': 5.0635208711433755e-06, 'epoch': 16.57} + 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4600/5520 [4:02:38<45:15, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6028281450271606, 'eval_runtime': 3.1492, 'eval_samples_per_second': 56.84, 'eval_steps_per_second': 56.84, 'epoch': 16.57} +{'loss': 40.3154, 'grad_norm': 325.78387451171875, 'learning_rate': 5.0580762250453725e-06, 'epoch': 16.57} +{'loss': 39.5046, 'grad_norm': 221.56591796875, 'learning_rate': 5.052631578947369e-06, 'epoch': 16.58} +{'loss': 38.3611, 'grad_norm': 227.02520751953125, 'learning_rate': 5.047186932849365e-06, 'epoch': 16.58} +{'loss': 36.5043, 'grad_norm': 232.46922302246094, 'learning_rate': 5.041742286751361e-06, 'epoch': 16.59} +{'loss': 36.2179, 'grad_norm': 230.59536743164062, 'learning_rate': 5.0362976406533575e-06, 'epoch': 16.59} +{'loss': 36.4797, 'grad_norm': 439.9609069824219, 'learning_rate': 5.0308529945553545e-06, 'epoch': 16.59} +{'loss': 37.4151, 'grad_norm': 322.4086608886719, 'learning_rate': 5.02540834845735e-06, 'epoch': 16.6} +{'loss': 37.2815, 'grad_norm': 318.1732482910156, 'learning_rate': 5.019963702359347e-06, 'epoch': 16.6} +{'loss': 36.8388, 'grad_norm': 321.34039306640625, 'learning_rate': 5.014519056261343e-06, 'epoch': 16.6} +{'loss': 37.9805, 'grad_norm': 341.28790283203125, 'learning_rate': 5.0090744101633395e-06, 'epoch': 16.61} + 83%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4600/5520 [4:02:42<45:15, 2.95s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4610/5520 [4:03:11<44:37, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6045316457748413, 'eval_runtime': 3.1402, 'eval_samples_per_second': 57.002, 'eval_steps_per_second': 57.002, 'epoch': 16.61} +{'loss': 37.5832, 'grad_norm': 259.9163513183594, 'learning_rate': 5.003629764065336e-06, 'epoch': 16.61} +{'loss': 37.3808, 'grad_norm': 297.02587890625, 'learning_rate': 4.998185117967332e-06, 'epoch': 16.61} +{'loss': 37.1047, 'grad_norm': 263.32244873046875, 'learning_rate': 4.992740471869329e-06, 'epoch': 16.62} +{'loss': 38.3592, 'grad_norm': 262.26104736328125, 'learning_rate': 4.987295825771324e-06, 'epoch': 16.62} +{'loss': 37.4098, 'grad_norm': 253.7144012451172, 'learning_rate': 4.9818511796733215e-06, 'epoch': 16.62} +{'loss': 39.3865, 'grad_norm': 279.1004943847656, 'learning_rate': 4.976406533575318e-06, 'epoch': 16.63} +{'loss': 38.6865, 'grad_norm': 298.7977600097656, 'learning_rate': 4.970961887477315e-06, 'epoch': 16.63} +{'loss': 38.7068, 'grad_norm': 256.7657470703125, 'learning_rate': 4.96551724137931e-06, 'epoch': 16.64} +{'loss': 37.749, 'grad_norm': 238.22979736328125, 'learning_rate': 4.960072595281307e-06, 'epoch': 16.64} +{'loss': 37.582, 'grad_norm': 248.4231414794922, 'learning_rate': 4.9546279491833035e-06, 'epoch': 16.64} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4620/5520 [4:03:43<43:51, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6026645302772522, 'eval_runtime': 3.1409, 'eval_samples_per_second': 56.99, 'eval_steps_per_second': 56.99, 'epoch': 16.64} +{'loss': 34.4589, 'grad_norm': 232.70289611816406, 'learning_rate': 4.949183303085299e-06, 'epoch': 16.65} +{'loss': 32.3619, 'grad_norm': 268.4678955078125, 'learning_rate': 4.943738656987296e-06, 'epoch': 16.65} +{'loss': 32.3436, 'grad_norm': 272.07794189453125, 'learning_rate': 4.938294010889292e-06, 'epoch': 16.65} +{'loss': 30.8798, 'grad_norm': 304.4588317871094, 'learning_rate': 4.932849364791289e-06, 'epoch': 16.66} +{'loss': 31.1892, 'grad_norm': 293.3638000488281, 'learning_rate': 4.927404718693285e-06, 'epoch': 16.66} +{'loss': 31.9604, 'grad_norm': 292.844482421875, 'learning_rate': 4.921960072595282e-06, 'epoch': 16.66} +{'loss': 32.242, 'grad_norm': 246.45339965820312, 'learning_rate': 4.916515426497278e-06, 'epoch': 16.67} +{'loss': 32.5072, 'grad_norm': 269.9577941894531, 'learning_rate': 4.911070780399274e-06, 'epoch': 16.67} +{'loss': 33.8243, 'grad_norm': 312.8960876464844, 'learning_rate': 4.90562613430127e-06, 'epoch': 16.68} +{'loss': 34.3557, 'grad_norm': 287.4557189941406, 'learning_rate': 4.900181488203267e-06, 'epoch': 16.68} + 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4630/5520 [4:04:16<43:28, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6047338843345642, 'eval_runtime': 3.1387, 'eval_samples_per_second': 57.03, 'eval_steps_per_second': 57.03, 'epoch': 16.68} +{'loss': 34.6895, 'grad_norm': 403.533935546875, 'learning_rate': 4.894736842105264e-06, 'epoch': 16.68} +{'loss': 34.2407, 'grad_norm': 387.5083923339844, 'learning_rate': 4.88929219600726e-06, 'epoch': 16.69} +{'loss': 33.3489, 'grad_norm': 278.8225402832031, 'learning_rate': 4.883847549909256e-06, 'epoch': 16.69} +{'loss': 34.2095, 'grad_norm': 270.46685791015625, 'learning_rate': 4.878402903811252e-06, 'epoch': 16.69} +{'loss': 35.783, 'grad_norm': 244.6392059326172, 'learning_rate': 4.872958257713249e-06, 'epoch': 16.7} +{'loss': 36.4928, 'grad_norm': 327.0617370605469, 'learning_rate': 4.867513611615245e-06, 'epoch': 16.7} +{'loss': 33.4827, 'grad_norm': 297.0531311035156, 'learning_rate': 4.862068965517241e-06, 'epoch': 16.7} +{'loss': 26.9456, 'grad_norm': 366.2174377441406, 'learning_rate': 4.856624319419238e-06, 'epoch': 16.71} +{'loss': 22.2349, 'grad_norm': 436.22613525390625, 'learning_rate': 4.851179673321234e-06, 'epoch': 16.71} +{'loss': 22.8557, 'grad_norm': 391.7647705078125, 'learning_rate': 4.845735027223231e-06, 'epoch': 16.72} + 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4640/5520 [4:04:48<43:06, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6052708029747009, 'eval_runtime': 3.1382, 'eval_samples_per_second': 57.038, 'eval_steps_per_second': 57.038, 'epoch': 16.72} +{'loss': 23.3521, 'grad_norm': 277.8678283691406, 'learning_rate': 4.840290381125227e-06, 'epoch': 16.72} +{'loss': 23.7394, 'grad_norm': 252.46131896972656, 'learning_rate': 4.834845735027224e-06, 'epoch': 16.72} +{'loss': 38.6633, 'grad_norm': 214.6287078857422, 'learning_rate': 4.82940108892922e-06, 'epoch': 16.73} +{'loss': 40.5165, 'grad_norm': 257.454345703125, 'learning_rate': 4.8239564428312155e-06, 'epoch': 16.73} +{'loss': 38.483, 'grad_norm': 211.1912841796875, 'learning_rate': 4.818511796733213e-06, 'epoch': 16.73} +{'loss': 39.6143, 'grad_norm': 226.8388214111328, 'learning_rate': 4.813067150635209e-06, 'epoch': 16.74} +{'loss': 37.8442, 'grad_norm': 263.8160400390625, 'learning_rate': 4.807622504537205e-06, 'epoch': 16.74} +{'loss': 39.1835, 'grad_norm': 284.8119201660156, 'learning_rate': 4.802177858439201e-06, 'epoch': 16.74} +{'loss': 38.7035, 'grad_norm': 310.31390380859375, 'learning_rate': 4.796733212341198e-06, 'epoch': 16.75} +{'loss': 38.8803, 'grad_norm': 212.71315002441406, 'learning_rate': 4.791288566243195e-06, 'epoch': 16.75} + 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4650/5520 [4:05:20<42:45, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6030828952789307, 'eval_runtime': 3.1388, 'eval_samples_per_second': 57.027, 'eval_steps_per_second': 57.027, 'epoch': 16.75} +{'loss': 39.0808, 'grad_norm': 209.7708740234375, 'learning_rate': 4.78584392014519e-06, 'epoch': 16.75} +{'loss': 39.2025, 'grad_norm': 251.971435546875, 'learning_rate': 4.780399274047187e-06, 'epoch': 16.76} +{'loss': 37.7541, 'grad_norm': 210.54151916503906, 'learning_rate': 4.774954627949183e-06, 'epoch': 16.76} +{'loss': 36.4328, 'grad_norm': 221.22119140625, 'learning_rate': 4.76950998185118e-06, 'epoch': 16.77} +{'loss': 34.9771, 'grad_norm': 201.45025634765625, 'learning_rate': 4.764065335753176e-06, 'epoch': 16.77} +{'loss': 37.6231, 'grad_norm': 241.33030700683594, 'learning_rate': 4.758620689655173e-06, 'epoch': 16.77} +{'loss': 36.9822, 'grad_norm': 282.12255859375, 'learning_rate': 4.753176043557169e-06, 'epoch': 16.78} +{'loss': 36.3529, 'grad_norm': 239.93885803222656, 'learning_rate': 4.747731397459165e-06, 'epoch': 16.78} +{'loss': 37.518, 'grad_norm': 245.9400634765625, 'learning_rate': 4.7422867513611615e-06, 'epoch': 16.78} +{'loss': 37.6323, 'grad_norm': 280.63720703125, 'learning_rate': 4.736842105263158e-06, 'epoch': 16.79} + 84%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4660/5520 [4:05:52<41:47, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6054876446723938, 'eval_runtime': 3.1439, 'eval_samples_per_second': 56.935, 'eval_steps_per_second': 56.935, 'epoch': 16.79} +{'loss': 38.1543, 'grad_norm': 368.47698974609375, 'learning_rate': 4.731397459165155e-06, 'epoch': 16.79} +{'loss': 38.8746, 'grad_norm': 346.9169616699219, 'learning_rate': 4.72595281306715e-06, 'epoch': 16.79} +{'loss': 37.3475, 'grad_norm': 311.7519836425781, 'learning_rate': 4.720508166969147e-06, 'epoch': 16.8} +{'loss': 38.5308, 'grad_norm': 323.14910888671875, 'learning_rate': 4.7150635208711435e-06, 'epoch': 16.8} +{'loss': 38.3275, 'grad_norm': 252.71958923339844, 'learning_rate': 4.70961887477314e-06, 'epoch': 16.81} +{'loss': 38.9973, 'grad_norm': 364.2929382324219, 'learning_rate': 4.704174228675136e-06, 'epoch': 16.81} +{'loss': 38.0867, 'grad_norm': 267.23980712890625, 'learning_rate': 4.698729582577132e-06, 'epoch': 16.81} +{'loss': 38.6933, 'grad_norm': 297.4647521972656, 'learning_rate': 4.693284936479129e-06, 'epoch': 16.82} +{'loss': 38.0279, 'grad_norm': 276.2767333984375, 'learning_rate': 4.6878402903811255e-06, 'epoch': 16.82} +{'loss': 36.5149, 'grad_norm': 261.5404052734375, 'learning_rate': 4.682395644283122e-06, 'epoch': 16.82} + 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4670/5520 [4:06:24<41:30, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6019832491874695, 'eval_runtime': 3.1363, 'eval_samples_per_second': 57.074, 'eval_steps_per_second': 57.074, 'epoch': 16.82} +{'loss': 35.6121, 'grad_norm': 313.2170104980469, 'learning_rate': 4.676950998185118e-06, 'epoch': 16.83} +{'loss': 31.1869, 'grad_norm': 297.2791442871094, 'learning_rate': 4.671506352087115e-06, 'epoch': 16.83} +{'loss': 31.8674, 'grad_norm': 269.7320556640625, 'learning_rate': 4.666061705989111e-06, 'epoch': 16.83} +{'loss': 30.3726, 'grad_norm': 245.3898468017578, 'learning_rate': 4.660617059891107e-06, 'epoch': 16.84} +{'loss': 32.6154, 'grad_norm': 244.63223266601562, 'learning_rate': 4.655172413793104e-06, 'epoch': 16.84} +{'loss': 33.0104, 'grad_norm': 263.6791076660156, 'learning_rate': 4.6497277676951e-06, 'epoch': 16.85} +{'loss': 32.5445, 'grad_norm': 398.6610107421875, 'learning_rate': 4.644283121597096e-06, 'epoch': 16.85} +{'loss': 32.5698, 'grad_norm': 312.8116149902344, 'learning_rate': 4.6388384754990924e-06, 'epoch': 16.85} +{'loss': 33.1377, 'grad_norm': 296.6167297363281, 'learning_rate': 4.6333938294010895e-06, 'epoch': 16.86} +{'loss': 33.3279, 'grad_norm': 285.299560546875, 'learning_rate': 4.627949183303086e-06, 'epoch': 16.86} + 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4680/5520 [4:06:56<40:57, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6027817726135254, 'eval_runtime': 3.1412, 'eval_samples_per_second': 56.985, 'eval_steps_per_second': 56.985, 'epoch': 16.86} +{'loss': 35.6879, 'grad_norm': 285.2948913574219, 'learning_rate': 4.622504537205081e-06, 'epoch': 16.86} +{'loss': 32.3154, 'grad_norm': 280.6530456542969, 'learning_rate': 4.617059891107078e-06, 'epoch': 16.87} +{'loss': 34.3517, 'grad_norm': 314.206787109375, 'learning_rate': 4.6116152450090744e-06, 'epoch': 16.87} +{'loss': 34.1571, 'grad_norm': 305.9198913574219, 'learning_rate': 4.6061705989110715e-06, 'epoch': 16.87} +{'loss': 35.1647, 'grad_norm': 287.0543212890625, 'learning_rate': 4.600725952813067e-06, 'epoch': 16.88} +{'loss': 34.8698, 'grad_norm': 286.912109375, 'learning_rate': 4.595281306715064e-06, 'epoch': 16.88} +{'loss': 36.3449, 'grad_norm': 322.4527587890625, 'learning_rate': 4.58983666061706e-06, 'epoch': 16.88} +{'loss': 25.3085, 'grad_norm': 239.41659545898438, 'learning_rate': 4.584392014519056e-06, 'epoch': 16.89} +{'loss': 22.3485, 'grad_norm': 215.5685577392578, 'learning_rate': 4.578947368421053e-06, 'epoch': 16.89} +{'loss': 22.3257, 'grad_norm': 291.2452697753906, 'learning_rate': 4.573502722323049e-06, 'epoch': 16.9} + 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4690/5520 [4:07:28<40:42, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6040940284729004, 'eval_runtime': 3.141, 'eval_samples_per_second': 56.988, 'eval_steps_per_second': 56.988, 'epoch': 16.9} +{'loss': 23.268, 'grad_norm': 291.39935302734375, 'learning_rate': 4.568058076225046e-06, 'epoch': 16.9} +{'loss': 23.7127, 'grad_norm': 272.211181640625, 'learning_rate': 4.562613430127041e-06, 'epoch': 16.9} +{'loss': 39.2488, 'grad_norm': 220.84397888183594, 'learning_rate': 4.5571687840290384e-06, 'epoch': 16.91} +{'loss': 39.5643, 'grad_norm': 238.49859619140625, 'learning_rate': 4.551724137931035e-06, 'epoch': 16.91} +{'loss': 38.6149, 'grad_norm': 325.3870544433594, 'learning_rate': 4.546279491833032e-06, 'epoch': 16.91} +{'loss': 38.0317, 'grad_norm': 307.02349853515625, 'learning_rate': 4.540834845735027e-06, 'epoch': 16.92} +{'loss': 40.4567, 'grad_norm': 433.99359130859375, 'learning_rate': 4.535390199637023e-06, 'epoch': 16.92} +{'loss': 40.3109, 'grad_norm': 327.97015380859375, 'learning_rate': 4.5299455535390204e-06, 'epoch': 16.92} +{'loss': 36.2826, 'grad_norm': 257.20684814453125, 'learning_rate': 4.524500907441017e-06, 'epoch': 16.93} +{'loss': 36.9163, 'grad_norm': 402.6732177734375, 'learning_rate': 4.519056261343013e-06, 'epoch': 16.93} + 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4700/5520 [4:08:01<40:01, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6016727089881897, 'eval_runtime': 3.1434, 'eval_samples_per_second': 56.944, 'eval_steps_per_second': 56.944, 'epoch': 16.93} +{'loss': 36.7101, 'grad_norm': 380.8903503417969, 'learning_rate': 4.513611615245009e-06, 'epoch': 16.94} +{'loss': 37.9853, 'grad_norm': 365.4950256347656, 'learning_rate': 4.508166969147006e-06, 'epoch': 16.94} +{'loss': 38.109, 'grad_norm': 302.3895568847656, 'learning_rate': 4.5027223230490016e-06, 'epoch': 16.94} +{'loss': 37.5992, 'grad_norm': 333.5274963378906, 'learning_rate': 4.497277676950998e-06, 'epoch': 16.95} +{'loss': 38.0139, 'grad_norm': 364.3126525878906, 'learning_rate': 4.491833030852995e-06, 'epoch': 16.95} +{'loss': 39.8027, 'grad_norm': 509.94671630859375, 'learning_rate': 4.486388384754991e-06, 'epoch': 16.95} +{'loss': 40.0044, 'grad_norm': 507.8591613769531, 'learning_rate': 4.480943738656987e-06, 'epoch': 16.96} +{'loss': 34.9058, 'grad_norm': 324.5463562011719, 'learning_rate': 4.4754990925589836e-06, 'epoch': 16.96} +{'loss': 33.1318, 'grad_norm': 318.39801025390625, 'learning_rate': 4.470054446460981e-06, 'epoch': 16.96} +{'loss': 32.2083, 'grad_norm': 391.8466796875, 'learning_rate': 4.464609800362977e-06, 'epoch': 16.97} + 85%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4710/5520 [4:08:32<38:42, 2.87s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6047930717468262, 'eval_runtime': 3.1398, 'eval_samples_per_second': 57.009, 'eval_steps_per_second': 57.009, 'epoch': 16.97} +{'loss': 31.9882, 'grad_norm': 530.4073486328125, 'learning_rate': 4.459165154264972e-06, 'epoch': 16.97} +{'loss': 34.1937, 'grad_norm': 590.9242553710938, 'learning_rate': 4.453720508166969e-06, 'epoch': 16.98} +{'loss': 34.6501, 'grad_norm': 377.5596618652344, 'learning_rate': 4.4482758620689656e-06, 'epoch': 16.98} +{'loss': 33.9402, 'grad_norm': 431.2909240722656, 'learning_rate': 4.442831215970962e-06, 'epoch': 16.98} +{'loss': 33.7873, 'grad_norm': 294.7673645019531, 'learning_rate': 4.437386569872958e-06, 'epoch': 16.99} +{'loss': 35.2935, 'grad_norm': 346.1203918457031, 'learning_rate': 4.431941923774955e-06, 'epoch': 16.99} +{'loss': 28.3513, 'grad_norm': 257.8351745605469, 'learning_rate': 4.426497277676951e-06, 'epoch': 16.99} +{'loss': 22.3009, 'grad_norm': 168.35118103027344, 'learning_rate': 4.421052631578947e-06, 'epoch': 17.0} +{'loss': 20.1848, 'grad_norm': 210.20738220214844, 'learning_rate': 4.415607985480944e-06, 'epoch': 17.0} +{'loss': 38.0969, 'grad_norm': 234.40866088867188, 'learning_rate': 4.41016333938294e-06, 'epoch': 17.0} + 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4720/5520 [4:09:05<39:00, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6026900410652161, 'eval_runtime': 3.1364, 'eval_samples_per_second': 57.072, 'eval_steps_per_second': 57.072, 'epoch': 17.0} +{'loss': 38.8902, 'grad_norm': 242.27195739746094, 'learning_rate': 4.404718693284937e-06, 'epoch': 17.01} +{'loss': 38.5509, 'grad_norm': 215.1695556640625, 'learning_rate': 4.3992740471869325e-06, 'epoch': 17.01} +{'loss': 38.5247, 'grad_norm': 390.2027587890625, 'learning_rate': 4.3938294010889296e-06, 'epoch': 17.01} +{'loss': 39.1981, 'grad_norm': 397.77484130859375, 'learning_rate': 4.388384754990926e-06, 'epoch': 17.02} +{'loss': 38.2627, 'grad_norm': 298.10089111328125, 'learning_rate': 4.382940108892923e-06, 'epoch': 17.02} +{'loss': 38.8027, 'grad_norm': 291.7283935546875, 'learning_rate': 4.377495462794918e-06, 'epoch': 17.03} +{'loss': 38.6095, 'grad_norm': 254.8542938232422, 'learning_rate': 4.3720508166969145e-06, 'epoch': 17.03} +{'loss': 38.2955, 'grad_norm': 244.336181640625, 'learning_rate': 4.3666061705989116e-06, 'epoch': 17.03} +{'loss': 38.5203, 'grad_norm': 376.92523193359375, 'learning_rate': 4.361161524500907e-06, 'epoch': 17.04} +{'loss': 37.4332, 'grad_norm': 339.6172790527344, 'learning_rate': 4.355716878402904e-06, 'epoch': 17.04} + 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4730/5520 [4:09:37<38:41, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6024167537689209, 'eval_runtime': 3.1401, 'eval_samples_per_second': 57.005, 'eval_steps_per_second': 57.005, 'epoch': 17.04} +{'loss': 36.4444, 'grad_norm': 433.0855712890625, 'learning_rate': 4.3502722323049e-06, 'epoch': 17.04} +{'loss': 35.7802, 'grad_norm': 224.3468475341797, 'learning_rate': 4.344827586206897e-06, 'epoch': 17.05} +{'loss': 35.4641, 'grad_norm': 385.5466003417969, 'learning_rate': 4.339382940108893e-06, 'epoch': 17.05} +{'loss': 36.4231, 'grad_norm': 311.80596923828125, 'learning_rate': 4.333938294010889e-06, 'epoch': 17.05} +{'loss': 37.5405, 'grad_norm': 283.189453125, 'learning_rate': 4.328493647912886e-06, 'epoch': 17.06} +{'loss': 37.4723, 'grad_norm': 403.85833740234375, 'learning_rate': 4.323049001814882e-06, 'epoch': 17.06} +{'loss': 36.6799, 'grad_norm': 390.03515625, 'learning_rate': 4.3176043557168785e-06, 'epoch': 17.07} +{'loss': 36.6312, 'grad_norm': 318.63427734375, 'learning_rate': 4.312159709618875e-06, 'epoch': 17.07} +{'loss': 37.9104, 'grad_norm': 318.43402099609375, 'learning_rate': 4.306715063520872e-06, 'epoch': 17.07} +{'loss': 36.7254, 'grad_norm': 320.9336853027344, 'learning_rate': 4.301270417422867e-06, 'epoch': 17.08} + 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4740/5520 [4:10:09<38:00, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6046721339225769, 'eval_runtime': 3.1418, 'eval_samples_per_second': 56.974, 'eval_steps_per_second': 56.974, 'epoch': 17.08} +{'loss': 36.0298, 'grad_norm': 345.9001770019531, 'learning_rate': 4.295825771324863e-06, 'epoch': 17.08} +{'loss': 37.9418, 'grad_norm': 397.10369873046875, 'learning_rate': 4.2903811252268605e-06, 'epoch': 17.08} +{'loss': 37.2627, 'grad_norm': 293.1039123535156, 'learning_rate': 4.284936479128857e-06, 'epoch': 17.09} +{'loss': 38.3429, 'grad_norm': 412.5190734863281, 'learning_rate': 4.279491833030853e-06, 'epoch': 17.09} +{'loss': 38.559, 'grad_norm': 241.35105895996094, 'learning_rate': 4.274047186932849e-06, 'epoch': 17.09} +{'loss': 36.8167, 'grad_norm': 275.169189453125, 'learning_rate': 4.268602540834846e-06, 'epoch': 17.1} +{'loss': 37.0246, 'grad_norm': 272.3182678222656, 'learning_rate': 4.2631578947368425e-06, 'epoch': 17.1} +{'loss': 33.1282, 'grad_norm': 215.6425018310547, 'learning_rate': 4.257713248638839e-06, 'epoch': 17.1} +{'loss': 33.2698, 'grad_norm': 276.6223449707031, 'learning_rate': 4.252268602540835e-06, 'epoch': 17.11} +{'loss': 31.0105, 'grad_norm': 311.1632385253906, 'learning_rate': 4.246823956442831e-06, 'epoch': 17.11} + 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4750/5520 [4:10:41<37:38, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6019421815872192, 'eval_runtime': 3.1354, 'eval_samples_per_second': 57.089, 'eval_steps_per_second': 57.089, 'epoch': 17.11} +{'loss': 31.4721, 'grad_norm': 254.7543487548828, 'learning_rate': 4.241379310344828e-06, 'epoch': 17.12} +{'loss': 31.0346, 'grad_norm': 239.24957275390625, 'learning_rate': 4.235934664246824e-06, 'epoch': 17.12} +{'loss': 32.0604, 'grad_norm': 262.0681457519531, 'learning_rate': 4.230490018148821e-06, 'epoch': 17.12} +{'loss': 32.2036, 'grad_norm': 218.3557586669922, 'learning_rate': 4.225045372050817e-06, 'epoch': 17.13} +{'loss': 32.1412, 'grad_norm': 277.5924072265625, 'learning_rate': 4.219600725952813e-06, 'epoch': 17.13} +{'loss': 34.3367, 'grad_norm': 226.93211364746094, 'learning_rate': 4.214156079854809e-06, 'epoch': 17.13} +{'loss': 33.2001, 'grad_norm': 303.2422180175781, 'learning_rate': 4.208711433756806e-06, 'epoch': 17.14} +{'loss': 34.155, 'grad_norm': 257.6164245605469, 'learning_rate': 4.203266787658803e-06, 'epoch': 17.14} +{'loss': 35.236, 'grad_norm': 361.1567077636719, 'learning_rate': 4.197822141560798e-06, 'epoch': 17.14} +{'loss': 34.304, 'grad_norm': 292.0034484863281, 'learning_rate': 4.192377495462795e-06, 'epoch': 17.15} + 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4760/5520 [4:11:13<37:09, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6034401059150696, 'eval_runtime': 3.1399, 'eval_samples_per_second': 57.008, 'eval_steps_per_second': 57.008, 'epoch': 17.15} +{'loss': 33.7346, 'grad_norm': 327.8070983886719, 'learning_rate': 4.186932849364791e-06, 'epoch': 17.15} +{'loss': 35.9274, 'grad_norm': 312.9547119140625, 'learning_rate': 4.1814882032667885e-06, 'epoch': 17.16} +{'loss': 35.5567, 'grad_norm': 305.19500732421875, 'learning_rate': 4.176043557168784e-06, 'epoch': 17.16} +{'loss': 35.8013, 'grad_norm': 339.37152099609375, 'learning_rate': 4.17059891107078e-06, 'epoch': 17.16} +{'loss': 29.2211, 'grad_norm': 247.36679077148438, 'learning_rate': 4.165154264972777e-06, 'epoch': 17.17} +{'loss': 21.6191, 'grad_norm': 255.65269470214844, 'learning_rate': 4.1597096188747725e-06, 'epoch': 17.17} +{'loss': 22.0521, 'grad_norm': 239.66448974609375, 'learning_rate': 4.15426497277677e-06, 'epoch': 17.17} +{'loss': 22.6641, 'grad_norm': 212.25955200195312, 'learning_rate': 4.148820326678766e-06, 'epoch': 17.18} +{'loss': 22.8787, 'grad_norm': 229.9394073486328, 'learning_rate': 4.143375680580763e-06, 'epoch': 17.18} +{'loss': 39.1222, 'grad_norm': 237.46343994140625, 'learning_rate': 4.137931034482758e-06, 'epoch': 17.18} + 86%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4770/5520 [4:11:45<36:34, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6031526327133179, 'eval_runtime': 3.1372, 'eval_samples_per_second': 57.057, 'eval_steps_per_second': 57.057, 'epoch': 17.18} +{'loss': 39.7664, 'grad_norm': 229.23849487304688, 'learning_rate': 4.132486388384755e-06, 'epoch': 17.19} +{'loss': 38.6754, 'grad_norm': 250.67529296875, 'learning_rate': 4.127041742286752e-06, 'epoch': 17.19} +{'loss': 39.1262, 'grad_norm': 272.9320068359375, 'learning_rate': 4.121597096188748e-06, 'epoch': 17.2} +{'loss': 38.2223, 'grad_norm': 267.82427978515625, 'learning_rate': 4.116152450090744e-06, 'epoch': 17.2} +{'loss': 39.2069, 'grad_norm': 266.35760498046875, 'learning_rate': 4.11070780399274e-06, 'epoch': 17.2} +{'loss': 38.8956, 'grad_norm': 221.62606811523438, 'learning_rate': 4.105263157894737e-06, 'epoch': 17.21} +{'loss': 41.5868, 'grad_norm': 243.73110961914062, 'learning_rate': 4.099818511796734e-06, 'epoch': 17.21} +{'loss': 39.1041, 'grad_norm': 268.6092224121094, 'learning_rate': 4.09437386569873e-06, 'epoch': 17.21} +{'loss': 38.25, 'grad_norm': 300.3140563964844, 'learning_rate': 4.088929219600726e-06, 'epoch': 17.22} +{'loss': 38.186, 'grad_norm': 264.56805419921875, 'learning_rate': 4.083484573502722e-06, 'epoch': 17.22} + 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4780/5520 [4:12:17<36:04, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6044566631317139, 'eval_runtime': 3.1383, 'eval_samples_per_second': 57.037, 'eval_steps_per_second': 57.037, 'epoch': 17.22} +{'loss': 37.7011, 'grad_norm': 303.47686767578125, 'learning_rate': 4.0780399274047185e-06, 'epoch': 17.22} +{'loss': 34.6695, 'grad_norm': 238.3590545654297, 'learning_rate': 4.072595281306715e-06, 'epoch': 17.23} +{'loss': 36.1903, 'grad_norm': 252.90081787109375, 'learning_rate': 4.067150635208712e-06, 'epoch': 17.23} +{'loss': 36.4185, 'grad_norm': 286.5584716796875, 'learning_rate': 4.061705989110708e-06, 'epoch': 17.23} +{'loss': 36.0098, 'grad_norm': 322.25323486328125, 'learning_rate': 4.056261343012704e-06, 'epoch': 17.24} +{'loss': 35.4347, 'grad_norm': 292.09405517578125, 'learning_rate': 4.0508166969147005e-06, 'epoch': 17.24} +{'loss': 37.3512, 'grad_norm': 295.9725341796875, 'learning_rate': 4.045372050816697e-06, 'epoch': 17.25} +{'loss': 38.6739, 'grad_norm': 326.34539794921875, 'learning_rate': 4.039927404718694e-06, 'epoch': 17.25} +{'loss': 38.0995, 'grad_norm': 384.3682861328125, 'learning_rate': 4.034482758620689e-06, 'epoch': 17.25} +{'loss': 36.7733, 'grad_norm': 400.59136962890625, 'learning_rate': 4.029038112522686e-06, 'epoch': 17.26} + 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4790/5520 [4:12:49<35:42, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6064656972885132, 'eval_runtime': 3.14, 'eval_samples_per_second': 57.005, 'eval_steps_per_second': 57.005, 'epoch': 17.26} +{'loss': 36.1385, 'grad_norm': 379.5261535644531, 'learning_rate': 4.0235934664246825e-06, 'epoch': 17.26} +{'loss': 39.1495, 'grad_norm': 277.1004638671875, 'learning_rate': 4.018148820326679e-06, 'epoch': 17.26} +{'loss': 37.8503, 'grad_norm': 274.6176452636719, 'learning_rate': 4.012704174228675e-06, 'epoch': 17.27} +{'loss': 39.7149, 'grad_norm': 338.9375305175781, 'learning_rate': 4.007259528130671e-06, 'epoch': 17.27} +{'loss': 37.6013, 'grad_norm': 299.60662841796875, 'learning_rate': 4.001814882032668e-06, 'epoch': 17.27} +{'loss': 38.1106, 'grad_norm': 278.9190368652344, 'learning_rate': 3.996370235934664e-06, 'epoch': 17.28} +{'loss': 35.9676, 'grad_norm': 254.48443603515625, 'learning_rate': 3.990925589836661e-06, 'epoch': 17.28} +{'loss': 35.3535, 'grad_norm': 274.65338134765625, 'learning_rate': 3.985480943738657e-06, 'epoch': 17.29} +{'loss': 32.7356, 'grad_norm': 288.748779296875, 'learning_rate': 3.980036297640654e-06, 'epoch': 17.29} +{'loss': 31.2048, 'grad_norm': 229.0682830810547, 'learning_rate': 3.9745916515426495e-06, 'epoch': 17.29} + 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4800/5520 [4:13:21<35:10, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6020387411117554, 'eval_runtime': 3.137, 'eval_samples_per_second': 57.06, 'eval_steps_per_second': 57.06, 'epoch': 17.29} +{'loss': 31.7953, 'grad_norm': 234.29937744140625, 'learning_rate': 3.9691470054446465e-06, 'epoch': 17.3} +{'loss': 31.6686, 'grad_norm': 236.3527069091797, 'learning_rate': 3.963702359346643e-06, 'epoch': 17.3} +{'loss': 31.8848, 'grad_norm': 253.44126892089844, 'learning_rate': 3.958257713248639e-06, 'epoch': 17.3} +{'loss': 32.1593, 'grad_norm': 270.66046142578125, 'learning_rate': 3.952813067150635e-06, 'epoch': 17.31} +{'loss': 32.4555, 'grad_norm': 242.77777099609375, 'learning_rate': 3.9473684210526315e-06, 'epoch': 17.31} +{'loss': 34.0444, 'grad_norm': 243.9296112060547, 'learning_rate': 3.9419237749546285e-06, 'epoch': 17.31} +{'loss': 32.0404, 'grad_norm': 276.2138671875, 'learning_rate': 3.936479128856624e-06, 'epoch': 17.32} +{'loss': 32.4535, 'grad_norm': 262.97802734375, 'learning_rate': 3.931034482758621e-06, 'epoch': 17.32} +{'loss': 34.6855, 'grad_norm': 338.9852600097656, 'learning_rate': 3.925589836660617e-06, 'epoch': 17.33} +{'loss': 32.2425, 'grad_norm': 270.85650634765625, 'learning_rate': 3.9201451905626135e-06, 'epoch': 17.33} + 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 4800/5520 [4:13:24<35:10, 2.93s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 4810/5520 [4:13:54<34:57, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.603055477142334, 'eval_runtime': 3.1384, 'eval_samples_per_second': 57.035, 'eval_steps_per_second': 57.035, 'epoch': 17.33} +{'loss': 34.6461, 'grad_norm': 289.17584228515625, 'learning_rate': 3.91470054446461e-06, 'epoch': 17.33} +{'loss': 34.5622, 'grad_norm': 301.120361328125, 'learning_rate': 3.909255898366606e-06, 'epoch': 17.34} +{'loss': 34.9585, 'grad_norm': 328.93524169921875, 'learning_rate': 3.903811252268603e-06, 'epoch': 17.34} +{'loss': 36.9729, 'grad_norm': 445.72003173828125, 'learning_rate': 3.898366606170599e-06, 'epoch': 17.34} +{'loss': 30.1609, 'grad_norm': 249.7901153564453, 'learning_rate': 3.8929219600725955e-06, 'epoch': 17.35} +{'loss': 21.6742, 'grad_norm': 230.1756134033203, 'learning_rate': 3.887477313974592e-06, 'epoch': 17.35} +{'loss': 22.0064, 'grad_norm': 193.68104553222656, 'learning_rate': 3.882032667876588e-06, 'epoch': 17.35} +{'loss': 23.1576, 'grad_norm': 232.58486938476562, 'learning_rate': 3.876588021778585e-06, 'epoch': 17.36} +{'loss': 23.5346, 'grad_norm': 256.0340270996094, 'learning_rate': 3.87114337568058e-06, 'epoch': 17.36} +{'loss': 39.5267, 'grad_norm': 260.8665771484375, 'learning_rate': 3.8656987295825775e-06, 'epoch': 17.36} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 87%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 4820/5520 [4:14:26<34:19, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6040924191474915, 'eval_runtime': 3.1444, 'eval_samples_per_second': 56.926, 'eval_steps_per_second': 56.926, 'epoch': 17.36} +{'loss': 40.222, 'grad_norm': 253.2076873779297, 'learning_rate': 3.860254083484574e-06, 'epoch': 17.37} +{'loss': 38.8405, 'grad_norm': 232.68162536621094, 'learning_rate': 3.85480943738657e-06, 'epoch': 17.37} +{'loss': 37.8169, 'grad_norm': 264.7735290527344, 'learning_rate': 3.849364791288566e-06, 'epoch': 17.38} +{'loss': 39.4413, 'grad_norm': 305.1289978027344, 'learning_rate': 3.843920145190563e-06, 'epoch': 17.38} +{'loss': 40.146, 'grad_norm': 409.03106689453125, 'learning_rate': 3.8384754990925594e-06, 'epoch': 17.38} +{'loss': 39.0141, 'grad_norm': 307.2272644042969, 'learning_rate': 3.833030852994555e-06, 'epoch': 17.39} +{'loss': 39.4356, 'grad_norm': 272.6708068847656, 'learning_rate': 3.827586206896552e-06, 'epoch': 17.39} +{'loss': 39.1581, 'grad_norm': 239.75225830078125, 'learning_rate': 3.822141560798548e-06, 'epoch': 17.39} +{'loss': 39.9827, 'grad_norm': 203.42205810546875, 'learning_rate': 3.816696914700545e-06, 'epoch': 17.4} +{'loss': 37.5404, 'grad_norm': 217.77159118652344, 'learning_rate': 3.811252268602541e-06, 'epoch': 17.4} + 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4830/5520 [4:14:59<33:43, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6033807396888733, 'eval_runtime': 3.1385, 'eval_samples_per_second': 57.033, 'eval_steps_per_second': 57.033, 'epoch': 17.4} +{'loss': 35.6571, 'grad_norm': 257.9713134765625, 'learning_rate': 3.8058076225045377e-06, 'epoch': 17.4} +{'loss': 34.7256, 'grad_norm': 295.11468505859375, 'learning_rate': 3.8003629764065335e-06, 'epoch': 17.41} +{'loss': 37.3417, 'grad_norm': 248.15908813476562, 'learning_rate': 3.7949183303085297e-06, 'epoch': 17.41} +{'loss': 37.0117, 'grad_norm': 295.19085693359375, 'learning_rate': 3.7894736842105264e-06, 'epoch': 17.42} +{'loss': 37.168, 'grad_norm': 249.31576538085938, 'learning_rate': 3.7840290381125226e-06, 'epoch': 17.42} +{'loss': 35.9932, 'grad_norm': 271.1731262207031, 'learning_rate': 3.7785843920145193e-06, 'epoch': 17.42} +{'loss': 36.952, 'grad_norm': 380.6817626953125, 'learning_rate': 3.7731397459165155e-06, 'epoch': 17.43} +{'loss': 38.2224, 'grad_norm': 370.125244140625, 'learning_rate': 3.767695099818512e-06, 'epoch': 17.43} +{'loss': 38.5377, 'grad_norm': 291.13568115234375, 'learning_rate': 3.7622504537205084e-06, 'epoch': 17.43} +{'loss': 38.1665, 'grad_norm': 329.5670471191406, 'learning_rate': 3.756805807622504e-06, 'epoch': 17.44} + 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4840/5520 [4:15:31<33:15, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6047329902648926, 'eval_runtime': 3.1359, 'eval_samples_per_second': 57.081, 'eval_steps_per_second': 57.081, 'epoch': 17.44} +{'loss': 34.8371, 'grad_norm': 266.0620422363281, 'learning_rate': 3.7513611615245012e-06, 'epoch': 17.44} +{'loss': 37.1885, 'grad_norm': 257.48980712890625, 'learning_rate': 3.7459165154264975e-06, 'epoch': 17.44} +{'loss': 38.1426, 'grad_norm': 346.8575439453125, 'learning_rate': 3.740471869328494e-06, 'epoch': 17.45} +{'loss': 37.6658, 'grad_norm': 246.66868591308594, 'learning_rate': 3.73502722323049e-06, 'epoch': 17.45} +{'loss': 38.2335, 'grad_norm': 309.71087646484375, 'learning_rate': 3.729582577132486e-06, 'epoch': 17.46} +{'loss': 38.5964, 'grad_norm': 304.1862487792969, 'learning_rate': 3.724137931034483e-06, 'epoch': 17.46} +{'loss': 38.9237, 'grad_norm': 253.73211669921875, 'learning_rate': 3.718693284936479e-06, 'epoch': 17.46} +{'loss': 35.9177, 'grad_norm': 208.52822875976562, 'learning_rate': 3.7132486388384757e-06, 'epoch': 17.47} +{'loss': 33.2577, 'grad_norm': 258.5502014160156, 'learning_rate': 3.707803992740472e-06, 'epoch': 17.47} +{'loss': 31.2634, 'grad_norm': 269.1754150390625, 'learning_rate': 3.7023593466424686e-06, 'epoch': 17.47} + 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4850/5520 [4:16:03<32:43, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6035012006759644, 'eval_runtime': 3.1369, 'eval_samples_per_second': 57.062, 'eval_steps_per_second': 57.062, 'epoch': 17.47} +{'loss': 30.6732, 'grad_norm': 268.5780029296875, 'learning_rate': 3.6969147005444644e-06, 'epoch': 17.48} +{'loss': 31.5905, 'grad_norm': 223.7191619873047, 'learning_rate': 3.691470054446461e-06, 'epoch': 17.48} +{'loss': 31.9407, 'grad_norm': 266.960205078125, 'learning_rate': 3.6860254083484573e-06, 'epoch': 17.48} +{'loss': 31.8078, 'grad_norm': 241.2608184814453, 'learning_rate': 3.680580762250454e-06, 'epoch': 17.49} +{'loss': 33.5336, 'grad_norm': 315.95166015625, 'learning_rate': 3.67513611615245e-06, 'epoch': 17.49} +{'loss': 33.0484, 'grad_norm': 277.731689453125, 'learning_rate': 3.669691470054447e-06, 'epoch': 17.49} +{'loss': 33.5048, 'grad_norm': 272.35137939453125, 'learning_rate': 3.664246823956443e-06, 'epoch': 17.5} +{'loss': 33.5782, 'grad_norm': 260.4573974609375, 'learning_rate': 3.6588021778584393e-06, 'epoch': 17.5} +{'loss': 35.0308, 'grad_norm': 285.7935485839844, 'learning_rate': 3.6533575317604355e-06, 'epoch': 17.51} +{'loss': 34.8067, 'grad_norm': 267.613037109375, 'learning_rate': 3.6479128856624317e-06, 'epoch': 17.51} + 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4860/5520 [4:16:36<33:15, 3.02s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6035751700401306, 'eval_runtime': 3.1383, 'eval_samples_per_second': 57.037, 'eval_steps_per_second': 57.037, 'epoch': 17.51} +{'loss': 33.1631, 'grad_norm': 301.43536376953125, 'learning_rate': 3.6424682395644284e-06, 'epoch': 17.51} +{'loss': 32.978, 'grad_norm': 270.10467529296875, 'learning_rate': 3.6370235934664246e-06, 'epoch': 17.52} +{'loss': 35.3346, 'grad_norm': 280.802001953125, 'learning_rate': 3.6315789473684213e-06, 'epoch': 17.52} +{'loss': 33.4881, 'grad_norm': 314.7720031738281, 'learning_rate': 3.6261343012704175e-06, 'epoch': 17.52} +{'loss': 31.5599, 'grad_norm': 347.4674072265625, 'learning_rate': 3.620689655172414e-06, 'epoch': 17.53} +{'loss': 22.159, 'grad_norm': 207.3061981201172, 'learning_rate': 3.61524500907441e-06, 'epoch': 17.53} +{'loss': 21.6584, 'grad_norm': 216.7202911376953, 'learning_rate': 3.6098003629764066e-06, 'epoch': 17.53} +{'loss': 22.9289, 'grad_norm': 260.20452880859375, 'learning_rate': 3.604355716878403e-06, 'epoch': 17.54} +{'loss': 23.7172, 'grad_norm': 295.9897766113281, 'learning_rate': 3.5989110707803995e-06, 'epoch': 17.54} +{'loss': 37.5844, 'grad_norm': 226.99484252929688, 'learning_rate': 3.5934664246823957e-06, 'epoch': 17.55} + 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4870/5520 [4:17:08<32:36, 3.01s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6059216260910034, 'eval_runtime': 3.1302, 'eval_samples_per_second': 57.185, 'eval_steps_per_second': 57.185, 'epoch': 17.55} +{'loss': 39.5191, 'grad_norm': 231.67477416992188, 'learning_rate': 3.588021778584392e-06, 'epoch': 17.55} +{'loss': 39.4246, 'grad_norm': 248.46058654785156, 'learning_rate': 3.5825771324863886e-06, 'epoch': 17.55} +{'loss': 38.9811, 'grad_norm': 239.17247009277344, 'learning_rate': 3.577132486388385e-06, 'epoch': 17.56} +{'loss': 38.4724, 'grad_norm': 325.3457946777344, 'learning_rate': 3.571687840290381e-06, 'epoch': 17.56} +{'loss': 38.79, 'grad_norm': 264.5011901855469, 'learning_rate': 3.5662431941923773e-06, 'epoch': 17.56} +{'loss': 38.0342, 'grad_norm': 251.97154235839844, 'learning_rate': 3.560798548094374e-06, 'epoch': 17.57} +{'loss': 39.8586, 'grad_norm': 236.78271484375, 'learning_rate': 3.55535390199637e-06, 'epoch': 17.57} +{'loss': 37.8967, 'grad_norm': 276.8800048828125, 'learning_rate': 3.549909255898367e-06, 'epoch': 17.57} +{'loss': 39.9833, 'grad_norm': 255.9346160888672, 'learning_rate': 3.544464609800363e-06, 'epoch': 17.58} +{'loss': 38.6235, 'grad_norm': 273.71337890625, 'learning_rate': 3.5390199637023597e-06, 'epoch': 17.58} + 88%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4880/5520 [4:17:41<32:08, 3.01s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6033145189285278, 'eval_runtime': 3.1252, 'eval_samples_per_second': 57.275, 'eval_steps_per_second': 57.275, 'epoch': 17.58} +{'loss': 37.9017, 'grad_norm': 252.93063354492188, 'learning_rate': 3.533575317604356e-06, 'epoch': 17.59} +{'loss': 34.6046, 'grad_norm': 259.8314208984375, 'learning_rate': 3.528130671506352e-06, 'epoch': 17.59} +{'loss': 35.301, 'grad_norm': 230.2709197998047, 'learning_rate': 3.5226860254083484e-06, 'epoch': 17.59} +{'loss': 37.4443, 'grad_norm': 306.6289367675781, 'learning_rate': 3.517241379310345e-06, 'epoch': 17.6} +{'loss': 36.3646, 'grad_norm': 241.5065460205078, 'learning_rate': 3.5117967332123413e-06, 'epoch': 17.6} +{'loss': 36.2621, 'grad_norm': 234.2492218017578, 'learning_rate': 3.5063520871143375e-06, 'epoch': 17.6} +{'loss': 36.2202, 'grad_norm': 256.5443115234375, 'learning_rate': 3.500907441016334e-06, 'epoch': 17.61} +{'loss': 37.5031, 'grad_norm': 280.31097412109375, 'learning_rate': 3.4954627949183304e-06, 'epoch': 17.61} +{'loss': 37.1418, 'grad_norm': 304.2773132324219, 'learning_rate': 3.4900181488203267e-06, 'epoch': 17.61} +{'loss': 37.1474, 'grad_norm': 361.27716064453125, 'learning_rate': 3.484573502722323e-06, 'epoch': 17.62} + 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4890/5520 [4:18:14<31:48, 3.03s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6052342653274536, 'eval_runtime': 3.1249, 'eval_samples_per_second': 57.282, 'eval_steps_per_second': 57.282, 'epoch': 17.62} +{'loss': 38.0673, 'grad_norm': 237.64540100097656, 'learning_rate': 3.4791288566243195e-06, 'epoch': 17.62} +{'loss': 38.8272, 'grad_norm': 351.27215576171875, 'learning_rate': 3.4736842105263158e-06, 'epoch': 17.62} +{'loss': 39.1524, 'grad_norm': 277.1895751953125, 'learning_rate': 3.4682395644283124e-06, 'epoch': 17.63} +{'loss': 37.9027, 'grad_norm': 275.1535949707031, 'learning_rate': 3.4627949183303086e-06, 'epoch': 17.63} +{'loss': 36.7233, 'grad_norm': 335.01776123046875, 'learning_rate': 3.4573502722323053e-06, 'epoch': 17.64} +{'loss': 37.782, 'grad_norm': 297.1637878417969, 'learning_rate': 3.4519056261343015e-06, 'epoch': 17.64} +{'loss': 37.6639, 'grad_norm': 265.400390625, 'learning_rate': 3.4464609800362978e-06, 'epoch': 17.64} +{'loss': 36.7617, 'grad_norm': 345.3449401855469, 'learning_rate': 3.441016333938294e-06, 'epoch': 17.65} +{'loss': 32.9906, 'grad_norm': 256.0724182128906, 'learning_rate': 3.4355716878402902e-06, 'epoch': 17.65} +{'loss': 32.0811, 'grad_norm': 260.698486328125, 'learning_rate': 3.430127041742287e-06, 'epoch': 17.65} + 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4900/5520 [4:18:47<30:16, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.603126585483551, 'eval_runtime': 3.1268, 'eval_samples_per_second': 57.247, 'eval_steps_per_second': 57.247, 'epoch': 17.65} +{'loss': 31.2138, 'grad_norm': 274.9847717285156, 'learning_rate': 3.424682395644283e-06, 'epoch': 17.66} +{'loss': 30.302, 'grad_norm': 345.5099182128906, 'learning_rate': 3.4192377495462798e-06, 'epoch': 17.66} +{'loss': 30.2679, 'grad_norm': 269.1453857421875, 'learning_rate': 3.413793103448276e-06, 'epoch': 17.66} +{'loss': 31.7616, 'grad_norm': 293.7955017089844, 'learning_rate': 3.4083484573502722e-06, 'epoch': 17.67} +{'loss': 33.1265, 'grad_norm': 306.1725769042969, 'learning_rate': 3.4029038112522685e-06, 'epoch': 17.67} +{'loss': 33.2131, 'grad_norm': 329.8185119628906, 'learning_rate': 3.397459165154265e-06, 'epoch': 17.68} +{'loss': 33.243, 'grad_norm': 340.790283203125, 'learning_rate': 3.3920145190562613e-06, 'epoch': 17.68} +{'loss': 33.6235, 'grad_norm': 324.004150390625, 'learning_rate': 3.386569872958258e-06, 'epoch': 17.68} +{'loss': 33.2524, 'grad_norm': 263.9126892089844, 'learning_rate': 3.3811252268602542e-06, 'epoch': 17.69} +{'loss': 34.6629, 'grad_norm': 274.6680603027344, 'learning_rate': 3.375680580762251e-06, 'epoch': 17.69} + 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4910/5520 [4:19:19<29:53, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6027778387069702, 'eval_runtime': 3.1418, 'eval_samples_per_second': 56.974, 'eval_steps_per_second': 56.974, 'epoch': 17.69} +{'loss': 33.3088, 'grad_norm': 317.1280822753906, 'learning_rate': 3.370235934664247e-06, 'epoch': 17.69} +{'loss': 34.5045, 'grad_norm': 304.1892395019531, 'learning_rate': 3.364791288566243e-06, 'epoch': 17.7} +{'loss': 35.8429, 'grad_norm': 278.75933837890625, 'learning_rate': 3.3593466424682396e-06, 'epoch': 17.7} +{'loss': 36.2401, 'grad_norm': 299.76971435546875, 'learning_rate': 3.353901996370236e-06, 'epoch': 17.7} +{'loss': 28.938, 'grad_norm': 253.46795654296875, 'learning_rate': 3.3484573502722324e-06, 'epoch': 17.71} +{'loss': 21.6689, 'grad_norm': 220.74098205566406, 'learning_rate': 3.3430127041742287e-06, 'epoch': 17.71} +{'loss': 21.3497, 'grad_norm': 255.79150390625, 'learning_rate': 3.3375680580762253e-06, 'epoch': 17.72} +{'loss': 22.9276, 'grad_norm': 284.2683410644531, 'learning_rate': 3.3321234119782216e-06, 'epoch': 17.72} +{'loss': 24.7304, 'grad_norm': 296.7882080078125, 'learning_rate': 3.3266787658802182e-06, 'epoch': 17.72} +{'loss': 38.7687, 'grad_norm': 217.35546875, 'learning_rate': 3.321234119782214e-06, 'epoch': 17.73} + 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4920/5520 [4:19:51<29:19, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6015192866325378, 'eval_runtime': 3.1363, 'eval_samples_per_second': 57.074, 'eval_steps_per_second': 57.074, 'epoch': 17.73} +{'loss': 39.7421, 'grad_norm': 256.7005920410156, 'learning_rate': 3.3157894736842107e-06, 'epoch': 17.73} +{'loss': 39.2911, 'grad_norm': 203.49417114257812, 'learning_rate': 3.310344827586207e-06, 'epoch': 17.73} +{'loss': 39.2524, 'grad_norm': 282.81439208984375, 'learning_rate': 3.3049001814882036e-06, 'epoch': 17.74} +{'loss': 37.2097, 'grad_norm': 315.3716735839844, 'learning_rate': 3.2994555353902e-06, 'epoch': 17.74} +{'loss': 37.6568, 'grad_norm': 250.96484375, 'learning_rate': 3.294010889292196e-06, 'epoch': 17.74} +{'loss': 38.9578, 'grad_norm': 299.4822082519531, 'learning_rate': 3.2885662431941927e-06, 'epoch': 17.75} +{'loss': 40.3838, 'grad_norm': 261.2537536621094, 'learning_rate': 3.2831215970961885e-06, 'epoch': 17.75} +{'loss': 39.2068, 'grad_norm': 220.55218505859375, 'learning_rate': 3.277676950998185e-06, 'epoch': 17.75} +{'loss': 40.5383, 'grad_norm': 238.06874084472656, 'learning_rate': 3.2722323049001814e-06, 'epoch': 17.76} +{'loss': 37.3857, 'grad_norm': 223.9597625732422, 'learning_rate': 3.266787658802178e-06, 'epoch': 17.76} + 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4930/5520 [4:20:23<28:44, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.602606475353241, 'eval_runtime': 3.1385, 'eval_samples_per_second': 57.033, 'eval_steps_per_second': 57.033, 'epoch': 17.76} +{'loss': 37.187, 'grad_norm': 278.9289245605469, 'learning_rate': 3.2613430127041742e-06, 'epoch': 17.77} +{'loss': 37.5243, 'grad_norm': 306.52398681640625, 'learning_rate': 3.255898366606171e-06, 'epoch': 17.77} +{'loss': 35.3104, 'grad_norm': 231.3939208984375, 'learning_rate': 3.250453720508167e-06, 'epoch': 17.77} +{'loss': 36.0904, 'grad_norm': 216.77613830566406, 'learning_rate': 3.2450090744101638e-06, 'epoch': 17.78} +{'loss': 36.4117, 'grad_norm': 256.0504150390625, 'learning_rate': 3.2395644283121596e-06, 'epoch': 17.78} +{'loss': 37.197, 'grad_norm': 253.29734802246094, 'learning_rate': 3.2341197822141562e-06, 'epoch': 17.78} +{'loss': 36.4606, 'grad_norm': 268.80780029296875, 'learning_rate': 3.2286751361161525e-06, 'epoch': 17.79} +{'loss': 36.8647, 'grad_norm': 302.3041076660156, 'learning_rate': 3.2232304900181487e-06, 'epoch': 17.79} +{'loss': 37.3981, 'grad_norm': 274.23797607421875, 'learning_rate': 3.2177858439201454e-06, 'epoch': 17.79} +{'loss': 37.2304, 'grad_norm': 281.4304504394531, 'learning_rate': 3.2123411978221416e-06, 'epoch': 17.8} + 89%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4940/5520 [4:20:55<28:11, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6050394773483276, 'eval_runtime': 3.1396, 'eval_samples_per_second': 57.013, 'eval_steps_per_second': 57.013, 'epoch': 17.8} +{'loss': 35.9281, 'grad_norm': 277.47698974609375, 'learning_rate': 3.2068965517241382e-06, 'epoch': 17.8} +{'loss': 39.0143, 'grad_norm': 394.02294921875, 'learning_rate': 3.201451905626134e-06, 'epoch': 17.81} +{'loss': 36.9452, 'grad_norm': 252.8087158203125, 'learning_rate': 3.1960072595281307e-06, 'epoch': 17.81} +{'loss': 39.2442, 'grad_norm': 249.54962158203125, 'learning_rate': 3.190562613430127e-06, 'epoch': 17.81} +{'loss': 38.6445, 'grad_norm': 286.9231262207031, 'learning_rate': 3.1851179673321236e-06, 'epoch': 17.82} +{'loss': 37.1794, 'grad_norm': 345.7146911621094, 'learning_rate': 3.17967332123412e-06, 'epoch': 17.82} +{'loss': 36.3952, 'grad_norm': 271.23089599609375, 'learning_rate': 3.1742286751361165e-06, 'epoch': 17.82} +{'loss': 33.8166, 'grad_norm': 406.3717346191406, 'learning_rate': 3.1687840290381127e-06, 'epoch': 17.83} +{'loss': 30.9614, 'grad_norm': 300.12554931640625, 'learning_rate': 3.1633393829401094e-06, 'epoch': 17.83} +{'loss': 31.8592, 'grad_norm': 229.67218017578125, 'learning_rate': 3.157894736842105e-06, 'epoch': 17.83} + 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4950/5520 [4:21:27<27:38, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6021057367324829, 'eval_runtime': 3.1376, 'eval_samples_per_second': 57.049, 'eval_steps_per_second': 57.049, 'epoch': 17.83} +{'loss': 31.7702, 'grad_norm': 269.0873107910156, 'learning_rate': 3.1524500907441014e-06, 'epoch': 17.84} +{'loss': 31.3615, 'grad_norm': 279.0237731933594, 'learning_rate': 3.147005444646098e-06, 'epoch': 17.84} +{'loss': 31.9314, 'grad_norm': 234.94839477539062, 'learning_rate': 3.1415607985480943e-06, 'epoch': 17.85} +{'loss': 32.4513, 'grad_norm': 239.25613403320312, 'learning_rate': 3.136116152450091e-06, 'epoch': 17.85} +{'loss': 34.4964, 'grad_norm': 257.09661865234375, 'learning_rate': 3.130671506352087e-06, 'epoch': 17.85} +{'loss': 33.1662, 'grad_norm': 328.88006591796875, 'learning_rate': 3.125226860254084e-06, 'epoch': 17.86} +{'loss': 34.4406, 'grad_norm': 291.4894714355469, 'learning_rate': 3.1197822141560796e-06, 'epoch': 17.86} +{'loss': 32.7141, 'grad_norm': 282.81158447265625, 'learning_rate': 3.1143375680580763e-06, 'epoch': 17.86} +{'loss': 34.3423, 'grad_norm': 300.0378112792969, 'learning_rate': 3.1088929219600725e-06, 'epoch': 17.87} +{'loss': 33.1653, 'grad_norm': 267.2983703613281, 'learning_rate': 3.103448275862069e-06, 'epoch': 17.87} + 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 4960/5520 [4:21:59<27:25, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6020416021347046, 'eval_runtime': 3.1359, 'eval_samples_per_second': 57.081, 'eval_steps_per_second': 57.081, 'epoch': 17.87} +{'loss': 34.7582, 'grad_norm': 270.53277587890625, 'learning_rate': 3.0980036297640654e-06, 'epoch': 17.87} +{'loss': 35.9911, 'grad_norm': 346.0074157714844, 'learning_rate': 3.092558983666062e-06, 'epoch': 17.88} +{'loss': 35.3345, 'grad_norm': 367.5807189941406, 'learning_rate': 3.0871143375680583e-06, 'epoch': 17.88} +{'loss': 32.9797, 'grad_norm': 304.21649169921875, 'learning_rate': 3.0816696914700545e-06, 'epoch': 17.88} +{'loss': 22.6226, 'grad_norm': 253.14601135253906, 'learning_rate': 3.0762250453720507e-06, 'epoch': 17.89} +{'loss': 21.9531, 'grad_norm': 270.3512268066406, 'learning_rate': 3.070780399274047e-06, 'epoch': 17.89} +{'loss': 21.8497, 'grad_norm': 192.73712158203125, 'learning_rate': 3.0653357531760436e-06, 'epoch': 17.9} +{'loss': 23.2694, 'grad_norm': 254.43759155273438, 'learning_rate': 3.05989110707804e-06, 'epoch': 17.9} +{'loss': 22.9774, 'grad_norm': 271.2293395996094, 'learning_rate': 3.0544464609800365e-06, 'epoch': 17.9} +{'loss': 38.8821, 'grad_norm': 213.7334747314453, 'learning_rate': 3.0490018148820327e-06, 'epoch': 17.91} + 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 4970/5520 [4:22:31<26:47, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.600848913192749, 'eval_runtime': 3.1384, 'eval_samples_per_second': 57.036, 'eval_steps_per_second': 57.036, 'epoch': 17.91} +{'loss': 38.6362, 'grad_norm': 269.9356384277344, 'learning_rate': 3.0435571687840294e-06, 'epoch': 17.91} +{'loss': 39.6388, 'grad_norm': 237.6484832763672, 'learning_rate': 3.0381125226860256e-06, 'epoch': 17.91} +{'loss': 39.4308, 'grad_norm': 304.2347106933594, 'learning_rate': 3.032667876588022e-06, 'epoch': 17.92} +{'loss': 40.1923, 'grad_norm': 250.6772918701172, 'learning_rate': 3.027223230490018e-06, 'epoch': 17.92} +{'loss': 37.862, 'grad_norm': 261.7320556640625, 'learning_rate': 3.0217785843920147e-06, 'epoch': 17.92} +{'loss': 35.9139, 'grad_norm': 385.33197021484375, 'learning_rate': 3.016333938294011e-06, 'epoch': 17.93} +{'loss': 36.6259, 'grad_norm': 436.6773986816406, 'learning_rate': 3.010889292196007e-06, 'epoch': 17.93} +{'loss': 36.1235, 'grad_norm': 318.65673828125, 'learning_rate': 3.005444646098004e-06, 'epoch': 17.94} +{'loss': 37.4148, 'grad_norm': 241.6234893798828, 'learning_rate': 3e-06, 'epoch': 17.94} +{'loss': 36.7089, 'grad_norm': 316.8415832519531, 'learning_rate': 2.9945553539019963e-06, 'epoch': 17.94} + 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 4980/5520 [4:23:03<26:24, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6032605171203613, 'eval_runtime': 3.137, 'eval_samples_per_second': 57.061, 'eval_steps_per_second': 57.061, 'epoch': 17.94} +{'loss': 37.2222, 'grad_norm': 322.0501403808594, 'learning_rate': 2.9891107078039925e-06, 'epoch': 17.95} +{'loss': 37.9156, 'grad_norm': 300.4189453125, 'learning_rate': 2.983666061705989e-06, 'epoch': 17.95} +{'loss': 38.5253, 'grad_norm': 304.39263916015625, 'learning_rate': 2.9782214156079854e-06, 'epoch': 17.95} +{'loss': 38.4385, 'grad_norm': 297.4574890136719, 'learning_rate': 2.972776769509982e-06, 'epoch': 17.96} +{'loss': 36.2943, 'grad_norm': 367.7257080078125, 'learning_rate': 2.9673321234119783e-06, 'epoch': 17.96} +{'loss': 30.8753, 'grad_norm': 274.61724853515625, 'learning_rate': 2.961887477313975e-06, 'epoch': 17.96} +{'loss': 32.1308, 'grad_norm': 358.50201416015625, 'learning_rate': 2.956442831215971e-06, 'epoch': 17.97} +{'loss': 33.2474, 'grad_norm': 493.7792663574219, 'learning_rate': 2.9509981851179674e-06, 'epoch': 17.97} +{'loss': 33.7065, 'grad_norm': 426.67138671875, 'learning_rate': 2.9455535390199636e-06, 'epoch': 17.98} +{'loss': 34.6007, 'grad_norm': 524.0231323242188, 'learning_rate': 2.94010889292196e-06, 'epoch': 17.98} + 90%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 4990/5520 [4:23:35<25:46, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6021283268928528, 'eval_runtime': 3.1337, 'eval_samples_per_second': 57.121, 'eval_steps_per_second': 57.121, 'epoch': 17.98} +{'loss': 33.9185, 'grad_norm': 395.26715087890625, 'learning_rate': 2.9346642468239565e-06, 'epoch': 17.98} +{'loss': 34.6485, 'grad_norm': 400.0454406738281, 'learning_rate': 2.9292196007259528e-06, 'epoch': 17.99} +{'loss': 34.668, 'grad_norm': 376.1269226074219, 'learning_rate': 2.9237749546279494e-06, 'epoch': 17.99} +{'loss': 30.7058, 'grad_norm': 315.5225524902344, 'learning_rate': 2.9183303085299456e-06, 'epoch': 17.99} +{'loss': 21.8055, 'grad_norm': 221.5032958984375, 'learning_rate': 2.912885662431942e-06, 'epoch': 18.0} +{'loss': 20.5066, 'grad_norm': 226.06068420410156, 'learning_rate': 2.907441016333938e-06, 'epoch': 18.0} +{'loss': 37.9156, 'grad_norm': 209.69607543945312, 'learning_rate': 2.9019963702359348e-06, 'epoch': 18.0} +{'loss': 38.8204, 'grad_norm': 218.86709594726562, 'learning_rate': 2.896551724137931e-06, 'epoch': 18.01} +{'loss': 38.5472, 'grad_norm': 218.38180541992188, 'learning_rate': 2.8911070780399276e-06, 'epoch': 18.01} +{'loss': 37.7233, 'grad_norm': 338.4778747558594, 'learning_rate': 2.885662431941924e-06, 'epoch': 18.01} + 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5000/5520 [4:24:07<25:28, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6013379096984863, 'eval_runtime': 3.1415, 'eval_samples_per_second': 56.979, 'eval_steps_per_second': 56.979, 'epoch': 18.01} +{'loss': 38.3321, 'grad_norm': 309.5385437011719, 'learning_rate': 2.8802177858439205e-06, 'epoch': 18.02} +{'loss': 38.2367, 'grad_norm': 335.67169189453125, 'learning_rate': 2.8747731397459168e-06, 'epoch': 18.02} +{'loss': 38.5516, 'grad_norm': 260.5025939941406, 'learning_rate': 2.8693284936479126e-06, 'epoch': 18.03} +{'loss': 38.9539, 'grad_norm': 265.4793395996094, 'learning_rate': 2.8638838475499092e-06, 'epoch': 18.03} +{'loss': 39.4582, 'grad_norm': 237.87942504882812, 'learning_rate': 2.8584392014519054e-06, 'epoch': 18.03} +{'loss': 39.3466, 'grad_norm': 252.11746215820312, 'learning_rate': 2.852994555353902e-06, 'epoch': 18.04} +{'loss': 36.9779, 'grad_norm': 298.1370849609375, 'learning_rate': 2.8475499092558983e-06, 'epoch': 18.04} +{'loss': 36.5117, 'grad_norm': 341.9007873535156, 'learning_rate': 2.842105263157895e-06, 'epoch': 18.04} +{'loss': 34.7543, 'grad_norm': 210.0319366455078, 'learning_rate': 2.8366606170598912e-06, 'epoch': 18.05} +{'loss': 36.4577, 'grad_norm': 385.6400146484375, 'learning_rate': 2.831215970961888e-06, 'epoch': 18.05} + 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5000/5520 [4:24:10<25:28, 2.94s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5010/5520 [4:24:41<25:02, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6031082272529602, 'eval_runtime': 3.1398, 'eval_samples_per_second': 57.011, 'eval_steps_per_second': 57.011, 'epoch': 18.05} +{'loss': 36.3765, 'grad_norm': 268.4949035644531, 'learning_rate': 2.8257713248638837e-06, 'epoch': 18.05} +{'loss': 35.709, 'grad_norm': 311.2984313964844, 'learning_rate': 2.8203266787658803e-06, 'epoch': 18.06} +{'loss': 35.7978, 'grad_norm': 264.0671081542969, 'learning_rate': 2.8148820326678766e-06, 'epoch': 18.06} +{'loss': 36.8963, 'grad_norm': 341.0770263671875, 'learning_rate': 2.8094373865698732e-06, 'epoch': 18.07} +{'loss': 37.1135, 'grad_norm': 253.3942108154297, 'learning_rate': 2.8039927404718694e-06, 'epoch': 18.07} +{'loss': 35.736, 'grad_norm': 286.23736572265625, 'learning_rate': 2.7985480943738657e-06, 'epoch': 18.07} +{'loss': 36.4917, 'grad_norm': 327.71295166015625, 'learning_rate': 2.7931034482758623e-06, 'epoch': 18.08} +{'loss': 37.2807, 'grad_norm': 351.00616455078125, 'learning_rate': 2.787658802177858e-06, 'epoch': 18.08} +{'loss': 38.0345, 'grad_norm': 291.02923583984375, 'learning_rate': 2.782214156079855e-06, 'epoch': 18.08} +{'loss': 37.112, 'grad_norm': 288.7776184082031, 'learning_rate': 2.776769509981851e-06, 'epoch': 18.09} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 5020/5520 [4:25:13<24:22, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6058472990989685, 'eval_runtime': 3.1359, 'eval_samples_per_second': 57.082, 'eval_steps_per_second': 57.082, 'epoch': 18.09} +{'loss': 37.9063, 'grad_norm': 437.8114929199219, 'learning_rate': 2.7713248638838477e-06, 'epoch': 18.09} +{'loss': 37.8524, 'grad_norm': 324.5924072265625, 'learning_rate': 2.765880217785844e-06, 'epoch': 18.09} +{'loss': 37.5547, 'grad_norm': 358.40625, 'learning_rate': 2.7604355716878406e-06, 'epoch': 18.1} +{'loss': 36.4437, 'grad_norm': 290.75604248046875, 'learning_rate': 2.7549909255898368e-06, 'epoch': 18.1} +{'loss': 34.3336, 'grad_norm': 284.41424560546875, 'learning_rate': 2.7495462794918334e-06, 'epoch': 18.1} +{'loss': 32.4527, 'grad_norm': 254.59889221191406, 'learning_rate': 2.7441016333938292e-06, 'epoch': 18.11} +{'loss': 30.4014, 'grad_norm': 266.0207214355469, 'learning_rate': 2.738656987295826e-06, 'epoch': 18.11} +{'loss': 30.2838, 'grad_norm': 219.9434356689453, 'learning_rate': 2.733212341197822e-06, 'epoch': 18.12} +{'loss': 31.6877, 'grad_norm': 312.7678527832031, 'learning_rate': 2.7277676950998188e-06, 'epoch': 18.12} +{'loss': 33.3686, 'grad_norm': 282.99774169921875, 'learning_rate': 2.722323049001815e-06, 'epoch': 18.12} + 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 5030/5520 [4:25:45<23:48, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6027761697769165, 'eval_runtime': 3.1395, 'eval_samples_per_second': 57.016, 'eval_steps_per_second': 57.016, 'epoch': 18.12} +{'loss': 32.5397, 'grad_norm': 371.9994201660156, 'learning_rate': 2.7168784029038112e-06, 'epoch': 18.13} +{'loss': 33.4329, 'grad_norm': 241.19049072265625, 'learning_rate': 2.711433756805808e-06, 'epoch': 18.13} +{'loss': 31.888, 'grad_norm': 310.2216796875, 'learning_rate': 2.7059891107078037e-06, 'epoch': 18.13} +{'loss': 33.9345, 'grad_norm': 277.1349182128906, 'learning_rate': 2.7005444646098004e-06, 'epoch': 18.14} +{'loss': 33.5826, 'grad_norm': 419.3515930175781, 'learning_rate': 2.6950998185117966e-06, 'epoch': 18.14} +{'loss': 34.324, 'grad_norm': 289.1166687011719, 'learning_rate': 2.6896551724137932e-06, 'epoch': 18.14} +{'loss': 34.45, 'grad_norm': 364.20233154296875, 'learning_rate': 2.6842105263157895e-06, 'epoch': 18.15} +{'loss': 33.9126, 'grad_norm': 341.71551513671875, 'learning_rate': 2.678765880217786e-06, 'epoch': 18.15} +{'loss': 33.7188, 'grad_norm': 283.1939697265625, 'learning_rate': 2.6733212341197824e-06, 'epoch': 18.16} +{'loss': 35.0354, 'grad_norm': 369.6583251953125, 'learning_rate': 2.667876588021779e-06, 'epoch': 18.16} + 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5040/5520 [4:26:17<23:31, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6033984422683716, 'eval_runtime': 3.1394, 'eval_samples_per_second': 57.018, 'eval_steps_per_second': 57.018, 'epoch': 18.16} +{'loss': 34.6853, 'grad_norm': 323.95806884765625, 'learning_rate': 2.662431941923775e-06, 'epoch': 18.16} +{'loss': 32.1261, 'grad_norm': 274.2629089355469, 'learning_rate': 2.6569872958257715e-06, 'epoch': 18.17} +{'loss': 22.0549, 'grad_norm': 229.66163635253906, 'learning_rate': 2.6515426497277677e-06, 'epoch': 18.17} +{'loss': 21.4483, 'grad_norm': 212.78070068359375, 'learning_rate': 2.646098003629764e-06, 'epoch': 18.17} +{'loss': 22.5133, 'grad_norm': 184.7995147705078, 'learning_rate': 2.6406533575317606e-06, 'epoch': 18.18} +{'loss': 23.6443, 'grad_norm': 256.6748046875, 'learning_rate': 2.635208711433757e-06, 'epoch': 18.18} +{'loss': 38.3633, 'grad_norm': 230.683349609375, 'learning_rate': 2.6297640653357535e-06, 'epoch': 18.18} +{'loss': 40.1229, 'grad_norm': 251.70166015625, 'learning_rate': 2.6243194192377497e-06, 'epoch': 18.19} +{'loss': 38.6539, 'grad_norm': 219.9066162109375, 'learning_rate': 2.618874773139746e-06, 'epoch': 18.19} +{'loss': 38.0385, 'grad_norm': 290.7185974121094, 'learning_rate': 2.613430127041742e-06, 'epoch': 18.2} + 91%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5050/5520 [4:26:49<22:56, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6022469401359558, 'eval_runtime': 3.1408, 'eval_samples_per_second': 56.993, 'eval_steps_per_second': 56.993, 'epoch': 18.2} +{'loss': 38.2381, 'grad_norm': 334.9693908691406, 'learning_rate': 2.607985480943739e-06, 'epoch': 18.2} +{'loss': 39.2603, 'grad_norm': 283.9659423828125, 'learning_rate': 2.602540834845735e-06, 'epoch': 18.2} +{'loss': 39.633, 'grad_norm': 291.4002990722656, 'learning_rate': 2.5970961887477317e-06, 'epoch': 18.21} +{'loss': 39.1938, 'grad_norm': 249.14329528808594, 'learning_rate': 2.591651542649728e-06, 'epoch': 18.21} +{'loss': 39.8308, 'grad_norm': 226.1659393310547, 'learning_rate': 2.5862068965517246e-06, 'epoch': 18.21} +{'loss': 38.4712, 'grad_norm': 270.2198181152344, 'learning_rate': 2.5807622504537204e-06, 'epoch': 18.22} +{'loss': 37.3572, 'grad_norm': 263.83819580078125, 'learning_rate': 2.5753176043557166e-06, 'epoch': 18.22} +{'loss': 36.3821, 'grad_norm': 316.8177795410156, 'learning_rate': 2.5698729582577133e-06, 'epoch': 18.22} +{'loss': 34.8209, 'grad_norm': 318.7213134765625, 'learning_rate': 2.5644283121597095e-06, 'epoch': 18.23} +{'loss': 35.6173, 'grad_norm': 267.6168518066406, 'learning_rate': 2.558983666061706e-06, 'epoch': 18.23} + 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 5060/5520 [4:27:21<22:23, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6044466495513916, 'eval_runtime': 3.1382, 'eval_samples_per_second': 57.038, 'eval_steps_per_second': 57.038, 'epoch': 18.23} +{'loss': 35.2828, 'grad_norm': 277.739501953125, 'learning_rate': 2.5535390199637024e-06, 'epoch': 18.23} +{'loss': 36.7972, 'grad_norm': 288.2068786621094, 'learning_rate': 2.548094373865699e-06, 'epoch': 18.24} +{'loss': 36.3637, 'grad_norm': 217.59716796875, 'learning_rate': 2.5426497277676953e-06, 'epoch': 18.24} +{'loss': 37.3086, 'grad_norm': 411.8970031738281, 'learning_rate': 2.5372050816696915e-06, 'epoch': 18.25} +{'loss': 37.0896, 'grad_norm': 351.9718933105469, 'learning_rate': 2.5317604355716877e-06, 'epoch': 18.25} +{'loss': 37.2533, 'grad_norm': 343.1683044433594, 'learning_rate': 2.5263157894736844e-06, 'epoch': 18.25} +{'loss': 36.9987, 'grad_norm': 413.0977783203125, 'learning_rate': 2.5208711433756806e-06, 'epoch': 18.26} +{'loss': 36.8624, 'grad_norm': 331.73223876953125, 'learning_rate': 2.5154264972776773e-06, 'epoch': 18.26} +{'loss': 37.949, 'grad_norm': 434.96990966796875, 'learning_rate': 2.5099818511796735e-06, 'epoch': 18.26} +{'loss': 37.6272, 'grad_norm': 324.4934997558594, 'learning_rate': 2.5045372050816697e-06, 'epoch': 18.27} + 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 5070/5520 [4:27:53<21:52, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6042292714118958, 'eval_runtime': 3.1335, 'eval_samples_per_second': 57.125, 'eval_steps_per_second': 57.125, 'epoch': 18.27} +{'loss': 38.6362, 'grad_norm': 312.1228942871094, 'learning_rate': 2.499092558983666e-06, 'epoch': 18.27} +{'loss': 39.2934, 'grad_norm': 427.6184997558594, 'learning_rate': 2.493647912885662e-06, 'epoch': 18.27} +{'loss': 38.0684, 'grad_norm': 344.6819763183594, 'learning_rate': 2.488203266787659e-06, 'epoch': 18.28} +{'loss': 38.2323, 'grad_norm': 317.42303466796875, 'learning_rate': 2.482758620689655e-06, 'epoch': 18.28} +{'loss': 34.2699, 'grad_norm': 338.830810546875, 'learning_rate': 2.4773139745916517e-06, 'epoch': 18.29} +{'loss': 32.5149, 'grad_norm': 286.7263488769531, 'learning_rate': 2.471869328493648e-06, 'epoch': 18.29} +{'loss': 31.033, 'grad_norm': 278.9923095703125, 'learning_rate': 2.4664246823956446e-06, 'epoch': 18.29} +{'loss': 29.5549, 'grad_norm': 264.0198669433594, 'learning_rate': 2.460980036297641e-06, 'epoch': 18.3} +{'loss': 30.2173, 'grad_norm': 241.6163330078125, 'learning_rate': 2.455535390199637e-06, 'epoch': 18.3} +{'loss': 30.8286, 'grad_norm': 278.5418395996094, 'learning_rate': 2.4500907441016333e-06, 'epoch': 18.3} + 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5080/5520 [4:28:25<21:30, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6035094261169434, 'eval_runtime': 3.1352, 'eval_samples_per_second': 57.094, 'eval_steps_per_second': 57.094, 'epoch': 18.3} +{'loss': 33.6778, 'grad_norm': 277.5758056640625, 'learning_rate': 2.44464609800363e-06, 'epoch': 18.31} +{'loss': 33.5914, 'grad_norm': 295.81201171875, 'learning_rate': 2.439201451905626e-06, 'epoch': 18.31} +{'loss': 33.6203, 'grad_norm': 293.4093017578125, 'learning_rate': 2.4337568058076224e-06, 'epoch': 18.31} +{'loss': 33.6465, 'grad_norm': 277.2228698730469, 'learning_rate': 2.428312159709619e-06, 'epoch': 18.32} +{'loss': 32.6013, 'grad_norm': 286.3224792480469, 'learning_rate': 2.4228675136116153e-06, 'epoch': 18.32} +{'loss': 32.6469, 'grad_norm': 320.6168212890625, 'learning_rate': 2.417422867513612e-06, 'epoch': 18.33} +{'loss': 34.354, 'grad_norm': 327.364990234375, 'learning_rate': 2.4119782214156078e-06, 'epoch': 18.33} +{'loss': 34.3143, 'grad_norm': 342.06634521484375, 'learning_rate': 2.4065335753176044e-06, 'epoch': 18.33} +{'loss': 33.7771, 'grad_norm': 370.70343017578125, 'learning_rate': 2.4010889292196006e-06, 'epoch': 18.34} +{'loss': 35.5377, 'grad_norm': 358.7357177734375, 'learning_rate': 2.3956442831215973e-06, 'epoch': 18.34} + 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5090/5520 [4:28:57<21:06, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6033809185028076, 'eval_runtime': 3.1442, 'eval_samples_per_second': 56.931, 'eval_steps_per_second': 56.931, 'epoch': 18.34} +{'loss': 35.4711, 'grad_norm': 463.8668518066406, 'learning_rate': 2.3901996370235935e-06, 'epoch': 18.34} +{'loss': 26.8532, 'grad_norm': 256.5113220214844, 'learning_rate': 2.38475499092559e-06, 'epoch': 18.35} +{'loss': 21.6636, 'grad_norm': 228.83883666992188, 'learning_rate': 2.3793103448275864e-06, 'epoch': 18.35} +{'loss': 22.2091, 'grad_norm': 238.70742797851562, 'learning_rate': 2.3738656987295826e-06, 'epoch': 18.35} +{'loss': 22.1242, 'grad_norm': 276.8741760253906, 'learning_rate': 2.368421052631579e-06, 'epoch': 18.36} +{'loss': 23.359, 'grad_norm': 226.4810333251953, 'learning_rate': 2.362976406533575e-06, 'epoch': 18.36} +{'loss': 37.7694, 'grad_norm': 212.53111267089844, 'learning_rate': 2.3575317604355718e-06, 'epoch': 18.36} +{'loss': 39.8064, 'grad_norm': 227.26710510253906, 'learning_rate': 2.352087114337568e-06, 'epoch': 18.37} +{'loss': 38.9716, 'grad_norm': 201.0309295654297, 'learning_rate': 2.3466424682395646e-06, 'epoch': 18.37} +{'loss': 39.8326, 'grad_norm': 311.7691345214844, 'learning_rate': 2.341197822141561e-06, 'epoch': 18.38} + 92%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 5100/5520 [4:29:29<20:28, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6036086082458496, 'eval_runtime': 3.1388, 'eval_samples_per_second': 57.029, 'eval_steps_per_second': 57.029, 'epoch': 18.38} +{'loss': 38.2591, 'grad_norm': 251.5362091064453, 'learning_rate': 2.3357531760435575e-06, 'epoch': 18.38} +{'loss': 38.0327, 'grad_norm': 241.64373779296875, 'learning_rate': 2.3303085299455533e-06, 'epoch': 18.38} +{'loss': 38.6853, 'grad_norm': 231.7598114013672, 'learning_rate': 2.32486388384755e-06, 'epoch': 18.39} +{'loss': 39.6929, 'grad_norm': 287.66644287109375, 'learning_rate': 2.3194192377495462e-06, 'epoch': 18.39} +{'loss': 38.3129, 'grad_norm': 289.3146057128906, 'learning_rate': 2.313974591651543e-06, 'epoch': 18.39} +{'loss': 38.2505, 'grad_norm': 291.4801330566406, 'learning_rate': 2.308529945553539e-06, 'epoch': 18.4} +{'loss': 37.7476, 'grad_norm': 337.4052429199219, 'learning_rate': 2.3030852994555358e-06, 'epoch': 18.4} +{'loss': 36.1112, 'grad_norm': 460.0773010253906, 'learning_rate': 2.297640653357532e-06, 'epoch': 18.4} +{'loss': 36.5374, 'grad_norm': 322.4940185546875, 'learning_rate': 2.292196007259528e-06, 'epoch': 18.41} +{'loss': 37.5286, 'grad_norm': 350.4710388183594, 'learning_rate': 2.2867513611615244e-06, 'epoch': 18.41} + 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 5110/5520 [4:30:01<19:55, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6045494079589844, 'eval_runtime': 3.1348, 'eval_samples_per_second': 57.101, 'eval_steps_per_second': 57.101, 'epoch': 18.41} +{'loss': 37.3676, 'grad_norm': 306.18634033203125, 'learning_rate': 2.2813067150635207e-06, 'epoch': 18.42} +{'loss': 36.6916, 'grad_norm': 289.237060546875, 'learning_rate': 2.2758620689655173e-06, 'epoch': 18.42} +{'loss': 36.2887, 'grad_norm': 266.69207763671875, 'learning_rate': 2.2704174228675136e-06, 'epoch': 18.42} +{'loss': 37.1267, 'grad_norm': 264.54119873046875, 'learning_rate': 2.2649727767695102e-06, 'epoch': 18.43} +{'loss': 36.6862, 'grad_norm': 262.6132507324219, 'learning_rate': 2.2595281306715064e-06, 'epoch': 18.43} +{'loss': 35.7714, 'grad_norm': 231.68226623535156, 'learning_rate': 2.254083484573503e-06, 'epoch': 18.43} +{'loss': 37.648, 'grad_norm': 299.72613525390625, 'learning_rate': 2.248638838475499e-06, 'epoch': 18.44} +{'loss': 35.9776, 'grad_norm': 424.94708251953125, 'learning_rate': 2.2431941923774956e-06, 'epoch': 18.44} +{'loss': 38.0571, 'grad_norm': 449.78570556640625, 'learning_rate': 2.2377495462794918e-06, 'epoch': 18.44} +{'loss': 37.758, 'grad_norm': 284.00634765625, 'learning_rate': 2.2323049001814884e-06, 'epoch': 18.45} + 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5120/5520 [4:30:33<19:32, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6064541935920715, 'eval_runtime': 3.1377, 'eval_samples_per_second': 57.048, 'eval_steps_per_second': 57.048, 'epoch': 18.45} +{'loss': 38.8924, 'grad_norm': 359.1011962890625, 'learning_rate': 2.2268602540834847e-06, 'epoch': 18.45} +{'loss': 38.2116, 'grad_norm': 307.7583923339844, 'learning_rate': 2.221415607985481e-06, 'epoch': 18.46} +{'loss': 39.6894, 'grad_norm': 359.5586242675781, 'learning_rate': 2.2159709618874776e-06, 'epoch': 18.46} +{'loss': 36.4586, 'grad_norm': 258.3985595703125, 'learning_rate': 2.2105263157894734e-06, 'epoch': 18.46} +{'loss': 34.489, 'grad_norm': 363.09600830078125, 'learning_rate': 2.20508166969147e-06, 'epoch': 18.47} +{'loss': 32.5826, 'grad_norm': 237.136474609375, 'learning_rate': 2.1996370235934662e-06, 'epoch': 18.47} +{'loss': 31.3005, 'grad_norm': 400.25604248046875, 'learning_rate': 2.194192377495463e-06, 'epoch': 18.47} +{'loss': 30.2261, 'grad_norm': 467.9855651855469, 'learning_rate': 2.188747731397459e-06, 'epoch': 18.48} +{'loss': 33.5844, 'grad_norm': 384.4250183105469, 'learning_rate': 2.1833030852994558e-06, 'epoch': 18.48} +{'loss': 32.5136, 'grad_norm': 324.4369201660156, 'learning_rate': 2.177858439201452e-06, 'epoch': 18.48} + 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5130/5520 [4:31:05<18:54, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.602573573589325, 'eval_runtime': 3.138, 'eval_samples_per_second': 57.043, 'eval_steps_per_second': 57.043, 'epoch': 18.48} +{'loss': 31.4322, 'grad_norm': 372.0033264160156, 'learning_rate': 2.1724137931034487e-06, 'epoch': 18.49} +{'loss': 34.163, 'grad_norm': 336.265869140625, 'learning_rate': 2.1669691470054445e-06, 'epoch': 18.49} +{'loss': 31.2627, 'grad_norm': 339.8494873046875, 'learning_rate': 2.161524500907441e-06, 'epoch': 18.49} +{'loss': 32.3994, 'grad_norm': 279.3925476074219, 'learning_rate': 2.1560798548094374e-06, 'epoch': 18.5} +{'loss': 34.8467, 'grad_norm': 281.546875, 'learning_rate': 2.1506352087114336e-06, 'epoch': 18.5} +{'loss': 33.632, 'grad_norm': 315.8692626953125, 'learning_rate': 2.1451905626134302e-06, 'epoch': 18.51} +{'loss': 34.312, 'grad_norm': 289.3066711425781, 'learning_rate': 2.1397459165154265e-06, 'epoch': 18.51} +{'loss': 32.9937, 'grad_norm': 274.190673828125, 'learning_rate': 2.134301270417423e-06, 'epoch': 18.51} +{'loss': 35.8788, 'grad_norm': 317.9950256347656, 'learning_rate': 2.1288566243194194e-06, 'epoch': 18.52} +{'loss': 35.2397, 'grad_norm': 342.9775695800781, 'learning_rate': 2.1234119782214156e-06, 'epoch': 18.52} + 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Ž | 5140/5520 [4:31:37<18:37, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6024553179740906, 'eval_runtime': 3.1411, 'eval_samples_per_second': 56.987, 'eval_steps_per_second': 56.987, 'epoch': 18.52} +{'loss': 33.1556, 'grad_norm': 351.09637451171875, 'learning_rate': 2.117967332123412e-06, 'epoch': 18.52} +{'loss': 26.6317, 'grad_norm': 229.55613708496094, 'learning_rate': 2.1125226860254085e-06, 'epoch': 18.53} +{'loss': 21.316, 'grad_norm': 234.53562927246094, 'learning_rate': 2.1070780399274047e-06, 'epoch': 18.53} +{'loss': 21.2739, 'grad_norm': 241.59982299804688, 'learning_rate': 2.1016333938294014e-06, 'epoch': 18.53} +{'loss': 22.736, 'grad_norm': 207.2808380126953, 'learning_rate': 2.0961887477313976e-06, 'epoch': 18.54} +{'loss': 22.7503, 'grad_norm': 236.13955688476562, 'learning_rate': 2.0907441016333942e-06, 'epoch': 18.54} +{'loss': 37.9001, 'grad_norm': 181.6793670654297, 'learning_rate': 2.08529945553539e-06, 'epoch': 18.55} +{'loss': 39.52, 'grad_norm': 249.5441131591797, 'learning_rate': 2.0798548094373863e-06, 'epoch': 18.55} +{'loss': 38.6667, 'grad_norm': 215.67855834960938, 'learning_rate': 2.074410163339383e-06, 'epoch': 18.55} +{'loss': 36.9602, 'grad_norm': 280.9402770996094, 'learning_rate': 2.068965517241379e-06, 'epoch': 18.56} + 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Œ | 5150/5520 [4:32:09<18:01, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6027256846427917, 'eval_runtime': 3.1361, 'eval_samples_per_second': 57.078, 'eval_steps_per_second': 57.078, 'epoch': 18.56} +{'loss': 38.8654, 'grad_norm': 265.9155578613281, 'learning_rate': 2.063520871143376e-06, 'epoch': 18.56} +{'loss': 38.8917, 'grad_norm': 300.0267028808594, 'learning_rate': 2.058076225045372e-06, 'epoch': 18.56} +{'loss': 39.2785, 'grad_norm': 243.0481414794922, 'learning_rate': 2.0526315789473687e-06, 'epoch': 18.57} +{'loss': 39.3892, 'grad_norm': 270.58380126953125, 'learning_rate': 2.047186932849365e-06, 'epoch': 18.57} +{'loss': 39.5933, 'grad_norm': 311.60430908203125, 'learning_rate': 2.041742286751361e-06, 'epoch': 18.57} +{'loss': 38.2962, 'grad_norm': 285.160400390625, 'learning_rate': 2.0362976406533574e-06, 'epoch': 18.58} +{'loss': 38.5965, 'grad_norm': 232.0592041015625, 'learning_rate': 2.030852994555354e-06, 'epoch': 18.58} +{'loss': 36.516, 'grad_norm': 221.85525512695312, 'learning_rate': 2.0254083484573503e-06, 'epoch': 18.59} +{'loss': 36.3976, 'grad_norm': 291.9794921875, 'learning_rate': 2.019963702359347e-06, 'epoch': 18.59} +{'loss': 35.2321, 'grad_norm': 387.8580322265625, 'learning_rate': 2.014519056261343e-06, 'epoch': 18.59} + 93%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–Š | 5160/5520 [4:32:41<17:31, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6030355095863342, 'eval_runtime': 3.1378, 'eval_samples_per_second': 57.046, 'eval_steps_per_second': 57.046, 'epoch': 18.59} +{'loss': 36.4186, 'grad_norm': 300.14508056640625, 'learning_rate': 2.0090744101633394e-06, 'epoch': 18.6} +{'loss': 36.014, 'grad_norm': 294.1235656738281, 'learning_rate': 2.0036297640653356e-06, 'epoch': 18.6} +{'loss': 36.1648, 'grad_norm': 389.1570129394531, 'learning_rate': 1.998185117967332e-06, 'epoch': 18.6} +{'loss': 36.1033, 'grad_norm': 244.6651153564453, 'learning_rate': 1.9927404718693285e-06, 'epoch': 18.61} +{'loss': 37.1531, 'grad_norm': 302.52996826171875, 'learning_rate': 1.9872958257713247e-06, 'epoch': 18.61} +{'loss': 37.8204, 'grad_norm': 352.86273193359375, 'learning_rate': 1.9818511796733214e-06, 'epoch': 18.61} +{'loss': 37.2097, 'grad_norm': 308.61431884765625, 'learning_rate': 1.9764065335753176e-06, 'epoch': 18.62} +{'loss': 36.4242, 'grad_norm': 288.30712890625, 'learning_rate': 1.9709618874773143e-06, 'epoch': 18.62} +{'loss': 35.9204, 'grad_norm': 315.9750671386719, 'learning_rate': 1.9655172413793105e-06, 'epoch': 18.62} +{'loss': 38.9178, 'grad_norm': 468.51055908203125, 'learning_rate': 1.9600725952813067e-06, 'epoch': 18.63} + 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 5170/5520 [4:33:13<16:57, 2.91s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6054540872573853, 'eval_runtime': 3.139, 'eval_samples_per_second': 57.025, 'eval_steps_per_second': 57.025, 'epoch': 18.63} +{'loss': 37.9588, 'grad_norm': 310.5861511230469, 'learning_rate': 1.954627949183303e-06, 'epoch': 18.63} +{'loss': 38.1028, 'grad_norm': 424.3090515136719, 'learning_rate': 1.9491833030852996e-06, 'epoch': 18.64} +{'loss': 36.5096, 'grad_norm': 330.6189880371094, 'learning_rate': 1.943738656987296e-06, 'epoch': 18.64} +{'loss': 36.871, 'grad_norm': 305.9330139160156, 'learning_rate': 1.9382940108892925e-06, 'epoch': 18.64} +{'loss': 37.4061, 'grad_norm': 410.06793212890625, 'learning_rate': 1.9328493647912887e-06, 'epoch': 18.65} +{'loss': 33.6399, 'grad_norm': 385.49127197265625, 'learning_rate': 1.927404718693285e-06, 'epoch': 18.65} +{'loss': 31.3483, 'grad_norm': 270.96783447265625, 'learning_rate': 1.9219600725952816e-06, 'epoch': 18.65} +{'loss': 30.2639, 'grad_norm': 329.84405517578125, 'learning_rate': 1.9165154264972774e-06, 'epoch': 18.66} +{'loss': 31.2749, 'grad_norm': 413.7260437011719, 'learning_rate': 1.911070780399274e-06, 'epoch': 18.66} +{'loss': 30.3596, 'grad_norm': 276.43585205078125, 'learning_rate': 1.9056261343012705e-06, 'epoch': 18.66} + 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5180/5520 [4:33:45<16:37, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6022100448608398, 'eval_runtime': 3.1339, 'eval_samples_per_second': 57.117, 'eval_steps_per_second': 57.117, 'epoch': 18.66} +{'loss': 32.4066, 'grad_norm': 248.9257049560547, 'learning_rate': 1.9001814882032667e-06, 'epoch': 18.67} +{'loss': 32.3724, 'grad_norm': 252.70388793945312, 'learning_rate': 1.8947368421052632e-06, 'epoch': 18.67} +{'loss': 32.3041, 'grad_norm': 325.0677795410156, 'learning_rate': 1.8892921960072596e-06, 'epoch': 18.68} +{'loss': 32.6609, 'grad_norm': 420.9740295410156, 'learning_rate': 1.883847549909256e-06, 'epoch': 18.68} +{'loss': 32.8471, 'grad_norm': 239.59371948242188, 'learning_rate': 1.878402903811252e-06, 'epoch': 18.68} +{'loss': 32.2686, 'grad_norm': 301.13165283203125, 'learning_rate': 1.8729582577132487e-06, 'epoch': 18.69} +{'loss': 34.2726, 'grad_norm': 282.7923889160156, 'learning_rate': 1.867513611615245e-06, 'epoch': 18.69} +{'loss': 35.335, 'grad_norm': 434.20550537109375, 'learning_rate': 1.8620689655172414e-06, 'epoch': 18.69} +{'loss': 33.3156, 'grad_norm': 306.680908203125, 'learning_rate': 1.8566243194192379e-06, 'epoch': 18.7} +{'loss': 34.9504, 'grad_norm': 253.27711486816406, 'learning_rate': 1.8511796733212343e-06, 'epoch': 18.7} + 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5190/5520 [4:34:18<16:09, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6021104454994202, 'eval_runtime': 3.1407, 'eval_samples_per_second': 56.994, 'eval_steps_per_second': 56.994, 'epoch': 18.7} +{'loss': 35.285, 'grad_norm': 391.74945068359375, 'learning_rate': 1.8457350272232305e-06, 'epoch': 18.7} +{'loss': 27.689, 'grad_norm': 265.4142150878906, 'learning_rate': 1.840290381125227e-06, 'epoch': 18.71} +{'loss': 22.6159, 'grad_norm': 217.80746459960938, 'learning_rate': 1.8348457350272234e-06, 'epoch': 18.71} +{'loss': 22.1321, 'grad_norm': 220.21180725097656, 'learning_rate': 1.8294010889292196e-06, 'epoch': 18.72} +{'loss': 22.5479, 'grad_norm': 239.4197998046875, 'learning_rate': 1.8239564428312159e-06, 'epoch': 18.72} +{'loss': 23.5363, 'grad_norm': 281.7828674316406, 'learning_rate': 1.8185117967332123e-06, 'epoch': 18.72} +{'loss': 39.0953, 'grad_norm': 231.81980895996094, 'learning_rate': 1.8130671506352088e-06, 'epoch': 18.73} +{'loss': 39.4842, 'grad_norm': 242.0535430908203, 'learning_rate': 1.807622504537205e-06, 'epoch': 18.73} +{'loss': 37.4884, 'grad_norm': 235.6869659423828, 'learning_rate': 1.8021778584392014e-06, 'epoch': 18.73} +{'loss': 38.9612, 'grad_norm': 291.5176086425781, 'learning_rate': 1.7967332123411979e-06, 'epoch': 18.74} + 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5200/5520 [4:34:50<15:34, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6040608286857605, 'eval_runtime': 3.1385, 'eval_samples_per_second': 57.034, 'eval_steps_per_second': 57.034, 'epoch': 18.74} +{'loss': 39.3531, 'grad_norm': 407.5574645996094, 'learning_rate': 1.7912885662431943e-06, 'epoch': 18.74} +{'loss': 38.4866, 'grad_norm': 277.07891845703125, 'learning_rate': 1.7858439201451905e-06, 'epoch': 18.74} +{'loss': 38.0073, 'grad_norm': 350.2939453125, 'learning_rate': 1.780399274047187e-06, 'epoch': 18.75} +{'loss': 38.1693, 'grad_norm': 395.7618103027344, 'learning_rate': 1.7749546279491834e-06, 'epoch': 18.75} +{'loss': 38.6162, 'grad_norm': 296.43267822265625, 'learning_rate': 1.7695099818511799e-06, 'epoch': 18.75} +{'loss': 38.9182, 'grad_norm': 335.7173156738281, 'learning_rate': 1.764065335753176e-06, 'epoch': 18.76} +{'loss': 38.0685, 'grad_norm': 273.09368896484375, 'learning_rate': 1.7586206896551725e-06, 'epoch': 18.76} +{'loss': 36.8994, 'grad_norm': 359.718505859375, 'learning_rate': 1.7531760435571688e-06, 'epoch': 18.77} +{'loss': 35.375, 'grad_norm': 345.5837097167969, 'learning_rate': 1.7477313974591652e-06, 'epoch': 18.77} +{'loss': 34.7559, 'grad_norm': 266.8583984375, 'learning_rate': 1.7422867513611614e-06, 'epoch': 18.77} + 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5200/5520 [4:34:53<15:34, 2.92s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 94%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5210/5520 [4:35:23<15:22, 2.98s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6007165908813477, 'eval_runtime': 3.1395, 'eval_samples_per_second': 57.016, 'eval_steps_per_second': 57.016, 'epoch': 18.77} +{'loss': 35.6206, 'grad_norm': 317.10662841796875, 'learning_rate': 1.7368421052631579e-06, 'epoch': 18.78} +{'loss': 36.7981, 'grad_norm': 418.6651916503906, 'learning_rate': 1.7313974591651543e-06, 'epoch': 18.78} +{'loss': 36.226, 'grad_norm': 247.767333984375, 'learning_rate': 1.7259528130671508e-06, 'epoch': 18.78} +{'loss': 36.5781, 'grad_norm': 406.6683349609375, 'learning_rate': 1.720508166969147e-06, 'epoch': 18.79} +{'loss': 37.8221, 'grad_norm': 433.02984619140625, 'learning_rate': 1.7150635208711434e-06, 'epoch': 18.79} +{'loss': 37.9125, 'grad_norm': 291.1831970214844, 'learning_rate': 1.7096188747731399e-06, 'epoch': 18.79} +{'loss': 38.0886, 'grad_norm': 276.8603820800781, 'learning_rate': 1.7041742286751361e-06, 'epoch': 18.8} +{'loss': 36.8432, 'grad_norm': 442.06317138671875, 'learning_rate': 1.6987295825771326e-06, 'epoch': 18.8} +{'loss': 37.2775, 'grad_norm': 323.7881774902344, 'learning_rate': 1.693284936479129e-06, 'epoch': 18.81} +{'loss': 37.4478, 'grad_norm': 320.2378234863281, 'learning_rate': 1.6878402903811254e-06, 'epoch': 18.81} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5220/5520 [4:35:55<14:42, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6044604182243347, 'eval_runtime': 3.1433, 'eval_samples_per_second': 56.946, 'eval_steps_per_second': 56.946, 'epoch': 18.81} +{'loss': 37.9463, 'grad_norm': 474.6519470214844, 'learning_rate': 1.6823956442831215e-06, 'epoch': 18.81} +{'loss': 37.7662, 'grad_norm': 265.7474060058594, 'learning_rate': 1.676950998185118e-06, 'epoch': 18.82} +{'loss': 37.3329, 'grad_norm': 312.014892578125, 'learning_rate': 1.6715063520871143e-06, 'epoch': 18.82} +{'loss': 36.4324, 'grad_norm': 407.24884033203125, 'learning_rate': 1.6660617059891108e-06, 'epoch': 18.82} +{'loss': 33.9691, 'grad_norm': 368.05255126953125, 'learning_rate': 1.660617059891107e-06, 'epoch': 18.83} +{'loss': 32.7008, 'grad_norm': 410.3034362792969, 'learning_rate': 1.6551724137931035e-06, 'epoch': 18.83} +{'loss': 32.1152, 'grad_norm': 318.6436462402344, 'learning_rate': 1.6497277676951e-06, 'epoch': 18.83} +{'loss': 31.3827, 'grad_norm': 366.3927307128906, 'learning_rate': 1.6442831215970963e-06, 'epoch': 18.84} +{'loss': 30.781, 'grad_norm': 319.7497863769531, 'learning_rate': 1.6388384754990926e-06, 'epoch': 18.84} +{'loss': 30.5807, 'grad_norm': 405.86669921875, 'learning_rate': 1.633393829401089e-06, 'epoch': 18.85} + 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5230/5520 [4:36:27<14:13, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6014994382858276, 'eval_runtime': 3.1339, 'eval_samples_per_second': 57.118, 'eval_steps_per_second': 57.118, 'epoch': 18.85} +{'loss': 33.4028, 'grad_norm': 518.0769653320312, 'learning_rate': 1.6279491833030855e-06, 'epoch': 18.85} +{'loss': 31.805, 'grad_norm': 390.18609619140625, 'learning_rate': 1.6225045372050819e-06, 'epoch': 18.85} +{'loss': 33.4414, 'grad_norm': 323.1091003417969, 'learning_rate': 1.6170598911070781e-06, 'epoch': 18.86} +{'loss': 34.1178, 'grad_norm': 311.3610534667969, 'learning_rate': 1.6116152450090744e-06, 'epoch': 18.86} +{'loss': 34.4702, 'grad_norm': 271.058349609375, 'learning_rate': 1.6061705989110708e-06, 'epoch': 18.86} +{'loss': 32.5166, 'grad_norm': 301.3417663574219, 'learning_rate': 1.600725952813067e-06, 'epoch': 18.87} +{'loss': 32.1952, 'grad_norm': 259.4634094238281, 'learning_rate': 1.5952813067150635e-06, 'epoch': 18.87} +{'loss': 33.6772, 'grad_norm': 299.018310546875, 'learning_rate': 1.58983666061706e-06, 'epoch': 18.87} +{'loss': 35.4991, 'grad_norm': 286.192626953125, 'learning_rate': 1.5843920145190564e-06, 'epoch': 18.88} +{'loss': 34.4324, 'grad_norm': 380.0414733886719, 'learning_rate': 1.5789473684210526e-06, 'epoch': 18.88} + 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5240/5520 [4:36:59<13:37, 2.92s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6009039282798767, 'eval_runtime': 3.136, 'eval_samples_per_second': 57.078, 'eval_steps_per_second': 57.078, 'epoch': 18.88} +{'loss': 35.8757, 'grad_norm': 333.0609436035156, 'learning_rate': 1.573502722323049e-06, 'epoch': 18.88} +{'loss': 30.4765, 'grad_norm': 343.6198425292969, 'learning_rate': 1.5680580762250455e-06, 'epoch': 18.89} +{'loss': 21.2017, 'grad_norm': 222.56637573242188, 'learning_rate': 1.562613430127042e-06, 'epoch': 18.89} +{'loss': 21.5447, 'grad_norm': 209.6859130859375, 'learning_rate': 1.5571687840290381e-06, 'epoch': 18.9} +{'loss': 23.6495, 'grad_norm': 249.7464141845703, 'learning_rate': 1.5517241379310346e-06, 'epoch': 18.9} +{'loss': 23.0331, 'grad_norm': 267.1141357421875, 'learning_rate': 1.546279491833031e-06, 'epoch': 18.9} +{'loss': 37.8988, 'grad_norm': 204.96266174316406, 'learning_rate': 1.5408348457350273e-06, 'epoch': 18.91} +{'loss': 38.5207, 'grad_norm': 247.50706481933594, 'learning_rate': 1.5353901996370235e-06, 'epoch': 18.91} +{'loss': 37.981, 'grad_norm': 350.968994140625, 'learning_rate': 1.52994555353902e-06, 'epoch': 18.91} +{'loss': 39.2602, 'grad_norm': 308.0031433105469, 'learning_rate': 1.5245009074410164e-06, 'epoch': 18.92} + 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5250/5520 [4:37:31<13:15, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6020543575286865, 'eval_runtime': 3.1367, 'eval_samples_per_second': 57.067, 'eval_steps_per_second': 57.067, 'epoch': 18.92} +{'loss': 39.7331, 'grad_norm': 353.0065002441406, 'learning_rate': 1.5190562613430128e-06, 'epoch': 18.92} +{'loss': 37.6413, 'grad_norm': 495.2381591796875, 'learning_rate': 1.513611615245009e-06, 'epoch': 18.92} +{'loss': 36.1928, 'grad_norm': 470.453125, 'learning_rate': 1.5081669691470055e-06, 'epoch': 18.93} +{'loss': 37.4057, 'grad_norm': 632.1090698242188, 'learning_rate': 1.502722323049002e-06, 'epoch': 18.93} +{'loss': 37.1323, 'grad_norm': 488.4659118652344, 'learning_rate': 1.4972776769509982e-06, 'epoch': 18.94} +{'loss': 36.1739, 'grad_norm': 426.4764709472656, 'learning_rate': 1.4918330308529946e-06, 'epoch': 18.94} +{'loss': 36.243, 'grad_norm': 413.3072509765625, 'learning_rate': 1.486388384754991e-06, 'epoch': 18.94} +{'loss': 36.8362, 'grad_norm': 364.8636169433594, 'learning_rate': 1.4809437386569875e-06, 'epoch': 18.95} +{'loss': 38.4677, 'grad_norm': 306.2213134765625, 'learning_rate': 1.4754990925589837e-06, 'epoch': 18.95} +{'loss': 38.1286, 'grad_norm': 300.37664794921875, 'learning_rate': 1.47005444646098e-06, 'epoch': 18.95} + 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5260/5520 [4:38:03<12:42, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6017122864723206, 'eval_runtime': 3.1398, 'eval_samples_per_second': 57.009, 'eval_steps_per_second': 57.009, 'epoch': 18.95} +{'loss': 36.9862, 'grad_norm': 242.2681884765625, 'learning_rate': 1.4646098003629764e-06, 'epoch': 18.96} +{'loss': 35.0475, 'grad_norm': 276.28179931640625, 'learning_rate': 1.4591651542649728e-06, 'epoch': 18.96} +{'loss': 30.4778, 'grad_norm': 256.64508056640625, 'learning_rate': 1.453720508166969e-06, 'epoch': 18.96} +{'loss': 32.3847, 'grad_norm': 275.1043701171875, 'learning_rate': 1.4482758620689655e-06, 'epoch': 18.97} +{'loss': 32.9917, 'grad_norm': 324.22955322265625, 'learning_rate': 1.442831215970962e-06, 'epoch': 18.97} +{'loss': 31.5901, 'grad_norm': 328.7778625488281, 'learning_rate': 1.4373865698729584e-06, 'epoch': 18.98} +{'loss': 33.5733, 'grad_norm': 307.2234191894531, 'learning_rate': 1.4319419237749546e-06, 'epoch': 18.98} +{'loss': 33.3204, 'grad_norm': 471.10552978515625, 'learning_rate': 1.426497277676951e-06, 'epoch': 18.98} +{'loss': 35.8205, 'grad_norm': 286.2314453125, 'learning_rate': 1.4210526315789475e-06, 'epoch': 18.99} +{'loss': 35.7746, 'grad_norm': 341.5156555175781, 'learning_rate': 1.415607985480944e-06, 'epoch': 18.99} + 95%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5270/5520 [4:38:35<12:12, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6023879051208496, 'eval_runtime': 3.1375, 'eval_samples_per_second': 57.051, 'eval_steps_per_second': 57.051, 'epoch': 18.99} +{'loss': 26.5263, 'grad_norm': 257.73345947265625, 'learning_rate': 1.4101633393829402e-06, 'epoch': 18.99} +{'loss': 21.9504, 'grad_norm': 197.04811096191406, 'learning_rate': 1.4047186932849366e-06, 'epoch': 19.0} +{'loss': 20.273, 'grad_norm': 237.48069763183594, 'learning_rate': 1.3992740471869328e-06, 'epoch': 19.0} +{'loss': 37.7406, 'grad_norm': 238.98065185546875, 'learning_rate': 1.393829401088929e-06, 'epoch': 19.0} +{'loss': 39.8367, 'grad_norm': 209.30593872070312, 'learning_rate': 1.3883847549909255e-06, 'epoch': 19.01} +{'loss': 39.0155, 'grad_norm': 251.27899169921875, 'learning_rate': 1.382940108892922e-06, 'epoch': 19.01} +{'loss': 37.9895, 'grad_norm': 278.8317565917969, 'learning_rate': 1.3774954627949184e-06, 'epoch': 19.01} +{'loss': 38.2986, 'grad_norm': 227.08090209960938, 'learning_rate': 1.3720508166969146e-06, 'epoch': 19.02} +{'loss': 38.9906, 'grad_norm': 248.63221740722656, 'learning_rate': 1.366606170598911e-06, 'epoch': 19.02} +{'loss': 39.4871, 'grad_norm': 216.49449157714844, 'learning_rate': 1.3611615245009075e-06, 'epoch': 19.03} + 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5280/5520 [4:39:07<11:42, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6001354455947876, 'eval_runtime': 3.1409, 'eval_samples_per_second': 56.991, 'eval_steps_per_second': 56.991, 'epoch': 19.03} +{'loss': 38.8617, 'grad_norm': 219.4734649658203, 'learning_rate': 1.355716878402904e-06, 'epoch': 19.03} +{'loss': 39.6489, 'grad_norm': 211.6996307373047, 'learning_rate': 1.3502722323049002e-06, 'epoch': 19.03} +{'loss': 39.4235, 'grad_norm': 306.1536865234375, 'learning_rate': 1.3448275862068966e-06, 'epoch': 19.04} +{'loss': 37.9957, 'grad_norm': 260.87353515625, 'learning_rate': 1.339382940108893e-06, 'epoch': 19.04} +{'loss': 36.4288, 'grad_norm': 266.5260314941406, 'learning_rate': 1.3339382940108895e-06, 'epoch': 19.04} +{'loss': 35.1091, 'grad_norm': 295.3840637207031, 'learning_rate': 1.3284936479128857e-06, 'epoch': 19.05} +{'loss': 37.6468, 'grad_norm': 381.60748291015625, 'learning_rate': 1.323049001814882e-06, 'epoch': 19.05} +{'loss': 35.8345, 'grad_norm': 430.3531494140625, 'learning_rate': 1.3176043557168784e-06, 'epoch': 19.05} +{'loss': 37.1803, 'grad_norm': 393.22772216796875, 'learning_rate': 1.3121597096188749e-06, 'epoch': 19.06} +{'loss': 36.5634, 'grad_norm': 308.1875915527344, 'learning_rate': 1.306715063520871e-06, 'epoch': 19.06} + 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5290/5520 [4:39:39<11:13, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6008215546607971, 'eval_runtime': 3.1371, 'eval_samples_per_second': 57.059, 'eval_steps_per_second': 57.059, 'epoch': 19.06} +{'loss': 36.7718, 'grad_norm': 379.57183837890625, 'learning_rate': 1.3012704174228675e-06, 'epoch': 19.07} +{'loss': 37.0207, 'grad_norm': 482.2864685058594, 'learning_rate': 1.295825771324864e-06, 'epoch': 19.07} +{'loss': 37.0438, 'grad_norm': 310.96142578125, 'learning_rate': 1.2903811252268602e-06, 'epoch': 19.07} +{'loss': 36.3401, 'grad_norm': 274.2409973144531, 'learning_rate': 1.2849364791288566e-06, 'epoch': 19.08} +{'loss': 36.6312, 'grad_norm': 242.37583923339844, 'learning_rate': 1.279491833030853e-06, 'epoch': 19.08} +{'loss': 37.4987, 'grad_norm': 244.91583251953125, 'learning_rate': 1.2740471869328495e-06, 'epoch': 19.08} +{'loss': 38.1373, 'grad_norm': 234.21511840820312, 'learning_rate': 1.2686025408348458e-06, 'epoch': 19.09} +{'loss': 38.8423, 'grad_norm': 277.73931884765625, 'learning_rate': 1.2631578947368422e-06, 'epoch': 19.09} +{'loss': 37.2783, 'grad_norm': 247.04971313476562, 'learning_rate': 1.2577132486388386e-06, 'epoch': 19.09} +{'loss': 36.2534, 'grad_norm': 289.022216796875, 'learning_rate': 1.2522686025408349e-06, 'epoch': 19.1} + 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5300/5520 [4:40:11<10:46, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6020083427429199, 'eval_runtime': 3.1426, 'eval_samples_per_second': 56.959, 'eval_steps_per_second': 56.959, 'epoch': 19.1} +{'loss': 36.4967, 'grad_norm': 294.7291564941406, 'learning_rate': 1.246823956442831e-06, 'epoch': 19.1} +{'loss': 34.1439, 'grad_norm': 238.0512237548828, 'learning_rate': 1.2413793103448275e-06, 'epoch': 19.1} +{'loss': 30.9632, 'grad_norm': 254.0712127685547, 'learning_rate': 1.235934664246824e-06, 'epoch': 19.11} +{'loss': 29.2757, 'grad_norm': 321.169921875, 'learning_rate': 1.2304900181488204e-06, 'epoch': 19.11} +{'loss': 31.2651, 'grad_norm': 308.8040466308594, 'learning_rate': 1.2250453720508167e-06, 'epoch': 19.12} +{'loss': 32.9721, 'grad_norm': 369.23004150390625, 'learning_rate': 1.219600725952813e-06, 'epoch': 19.12} +{'loss': 31.8663, 'grad_norm': 348.9309997558594, 'learning_rate': 1.2141560798548095e-06, 'epoch': 19.12} +{'loss': 31.6104, 'grad_norm': 330.5960388183594, 'learning_rate': 1.208711433756806e-06, 'epoch': 19.13} +{'loss': 32.1911, 'grad_norm': 380.59161376953125, 'learning_rate': 1.2032667876588022e-06, 'epoch': 19.13} +{'loss': 33.4755, 'grad_norm': 402.8847961425781, 'learning_rate': 1.1978221415607986e-06, 'epoch': 19.13} + 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5310/5520 [4:40:44<10:15, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6015223264694214, 'eval_runtime': 3.138, 'eval_samples_per_second': 57.043, 'eval_steps_per_second': 57.043, 'epoch': 19.13} +{'loss': 33.7318, 'grad_norm': 409.90667724609375, 'learning_rate': 1.192377495462795e-06, 'epoch': 19.14} +{'loss': 33.6745, 'grad_norm': 425.7220458984375, 'learning_rate': 1.1869328493647913e-06, 'epoch': 19.14} +{'loss': 33.8191, 'grad_norm': 373.9212951660156, 'learning_rate': 1.1814882032667876e-06, 'epoch': 19.14} +{'loss': 33.8767, 'grad_norm': 381.37469482421875, 'learning_rate': 1.176043557168784e-06, 'epoch': 19.15} +{'loss': 33.3089, 'grad_norm': 267.89288330078125, 'learning_rate': 1.1705989110707804e-06, 'epoch': 19.15} +{'loss': 35.798, 'grad_norm': 326.5400390625, 'learning_rate': 1.1651542649727767e-06, 'epoch': 19.16} +{'loss': 34.2442, 'grad_norm': 307.7875061035156, 'learning_rate': 1.1597096188747731e-06, 'epoch': 19.16} +{'loss': 34.7408, 'grad_norm': 401.6629333496094, 'learning_rate': 1.1542649727767695e-06, 'epoch': 19.16} +{'loss': 30.2776, 'grad_norm': 297.7433166503906, 'learning_rate': 1.148820326678766e-06, 'epoch': 19.17} +{'loss': 21.3755, 'grad_norm': 221.2977752685547, 'learning_rate': 1.1433756805807622e-06, 'epoch': 19.17} + 96%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5320/5520 [4:41:16<09:48, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6015586853027344, 'eval_runtime': 3.1382, 'eval_samples_per_second': 57.039, 'eval_steps_per_second': 57.039, 'epoch': 19.17} +{'loss': 20.9516, 'grad_norm': 232.3973846435547, 'learning_rate': 1.1379310344827587e-06, 'epoch': 19.17} +{'loss': 22.3779, 'grad_norm': 220.6842803955078, 'learning_rate': 1.1324863883847551e-06, 'epoch': 19.18} +{'loss': 23.4166, 'grad_norm': 207.9031982421875, 'learning_rate': 1.1270417422867515e-06, 'epoch': 19.18} +{'loss': 37.157, 'grad_norm': 211.70394897460938, 'learning_rate': 1.1215970961887478e-06, 'epoch': 19.18} +{'loss': 40.0688, 'grad_norm': 243.7276611328125, 'learning_rate': 1.1161524500907442e-06, 'epoch': 19.19} +{'loss': 38.9213, 'grad_norm': 199.99435424804688, 'learning_rate': 1.1107078039927405e-06, 'epoch': 19.19} +{'loss': 37.5778, 'grad_norm': 214.8607177734375, 'learning_rate': 1.1052631578947367e-06, 'epoch': 19.2} +{'loss': 36.9334, 'grad_norm': 241.69651794433594, 'learning_rate': 1.0998185117967331e-06, 'epoch': 19.2} +{'loss': 38.9315, 'grad_norm': 344.64849853515625, 'learning_rate': 1.0943738656987296e-06, 'epoch': 19.2} +{'loss': 37.94, 'grad_norm': 248.10731506347656, 'learning_rate': 1.088929219600726e-06, 'epoch': 19.21} + 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5330/5520 [4:41:48<09:19, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6011462211608887, 'eval_runtime': 3.1406, 'eval_samples_per_second': 56.995, 'eval_steps_per_second': 56.995, 'epoch': 19.21} +{'loss': 38.2016, 'grad_norm': 262.3296813964844, 'learning_rate': 1.0834845735027222e-06, 'epoch': 19.21} +{'loss': 39.0355, 'grad_norm': 276.65179443359375, 'learning_rate': 1.0780399274047187e-06, 'epoch': 19.21} +{'loss': 39.0543, 'grad_norm': 377.314697265625, 'learning_rate': 1.0725952813067151e-06, 'epoch': 19.22} +{'loss': 37.1001, 'grad_norm': 282.5917053222656, 'learning_rate': 1.0671506352087116e-06, 'epoch': 19.22} +{'loss': 36.5363, 'grad_norm': 420.4558410644531, 'learning_rate': 1.0617059891107078e-06, 'epoch': 19.22} +{'loss': 35.8127, 'grad_norm': 460.62701416015625, 'learning_rate': 1.0562613430127042e-06, 'epoch': 19.23} +{'loss': 35.7043, 'grad_norm': 492.31170654296875, 'learning_rate': 1.0508166969147007e-06, 'epoch': 19.23} +{'loss': 35.0656, 'grad_norm': 385.2608947753906, 'learning_rate': 1.0453720508166971e-06, 'epoch': 19.23} +{'loss': 37.2145, 'grad_norm': 322.3689270019531, 'learning_rate': 1.0399274047186931e-06, 'epoch': 19.24} +{'loss': 35.4361, 'grad_norm': 309.3829650878906, 'learning_rate': 1.0344827586206896e-06, 'epoch': 19.24} + 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5340/5520 [4:42:20<08:48, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6023690104484558, 'eval_runtime': 3.1424, 'eval_samples_per_second': 56.964, 'eval_steps_per_second': 56.964, 'epoch': 19.24} +{'loss': 36.9204, 'grad_norm': 342.5604248046875, 'learning_rate': 1.029038112522686e-06, 'epoch': 19.25} +{'loss': 37.9907, 'grad_norm': 404.432373046875, 'learning_rate': 1.0235934664246825e-06, 'epoch': 19.25} +{'loss': 36.1432, 'grad_norm': 333.77044677734375, 'learning_rate': 1.0181488203266787e-06, 'epoch': 19.25} +{'loss': 37.824, 'grad_norm': 297.11480712890625, 'learning_rate': 1.0127041742286751e-06, 'epoch': 19.26} +{'loss': 36.0811, 'grad_norm': 271.3321838378906, 'learning_rate': 1.0072595281306716e-06, 'epoch': 19.26} +{'loss': 36.6415, 'grad_norm': 246.6988525390625, 'learning_rate': 1.0018148820326678e-06, 'epoch': 19.26} +{'loss': 37.048, 'grad_norm': 264.7515563964844, 'learning_rate': 9.963702359346642e-07, 'epoch': 19.27} +{'loss': 37.3109, 'grad_norm': 238.71475219726562, 'learning_rate': 9.909255898366607e-07, 'epoch': 19.27} +{'loss': 37.0776, 'grad_norm': 232.89256286621094, 'learning_rate': 9.854809437386571e-07, 'epoch': 19.27} +{'loss': 37.5227, 'grad_norm': 309.91796875, 'learning_rate': 9.800362976406534e-07, 'epoch': 19.28} + 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5350/5520 [4:42:52<08:18, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.603413999080658, 'eval_runtime': 3.1407, 'eval_samples_per_second': 56.993, 'eval_steps_per_second': 56.993, 'epoch': 19.28} +{'loss': 38.7916, 'grad_norm': 415.85009765625, 'learning_rate': 9.745916515426498e-07, 'epoch': 19.28} +{'loss': 34.7108, 'grad_norm': 336.5480651855469, 'learning_rate': 9.691470054446462e-07, 'epoch': 19.29} +{'loss': 33.3624, 'grad_norm': 361.7843017578125, 'learning_rate': 9.637023593466425e-07, 'epoch': 19.29} +{'loss': 31.9202, 'grad_norm': 278.5044250488281, 'learning_rate': 9.582577132486387e-07, 'epoch': 19.29} +{'loss': 32.0191, 'grad_norm': 378.85003662109375, 'learning_rate': 9.528130671506353e-07, 'epoch': 19.3} +{'loss': 30.1278, 'grad_norm': 307.8309020996094, 'learning_rate': 9.473684210526316e-07, 'epoch': 19.3} +{'loss': 30.8298, 'grad_norm': 377.0649108886719, 'learning_rate': 9.41923774954628e-07, 'epoch': 19.3} +{'loss': 32.8491, 'grad_norm': 366.9952392578125, 'learning_rate': 9.364791288566244e-07, 'epoch': 19.31} +{'loss': 33.3014, 'grad_norm': 384.6134948730469, 'learning_rate': 9.310344827586207e-07, 'epoch': 19.31} +{'loss': 31.1514, 'grad_norm': 377.0379943847656, 'learning_rate': 9.255898366606171e-07, 'epoch': 19.31} + 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5360/5520 [4:43:24<07:49, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6012714505195618, 'eval_runtime': 3.1381, 'eval_samples_per_second': 57.042, 'eval_steps_per_second': 57.042, 'epoch': 19.31} +{'loss': 32.2402, 'grad_norm': 419.49359130859375, 'learning_rate': 9.201451905626135e-07, 'epoch': 19.32} +{'loss': 33.9084, 'grad_norm': 290.20050048828125, 'learning_rate': 9.147005444646098e-07, 'epoch': 19.32} +{'loss': 34.3691, 'grad_norm': 283.597412109375, 'learning_rate': 9.092558983666062e-07, 'epoch': 19.33} +{'loss': 33.2218, 'grad_norm': 322.4947204589844, 'learning_rate': 9.038112522686025e-07, 'epoch': 19.33} +{'loss': 32.6409, 'grad_norm': 346.0417785644531, 'learning_rate': 8.983666061705989e-07, 'epoch': 19.33} +{'loss': 33.722, 'grad_norm': 282.1748962402344, 'learning_rate': 8.929219600725953e-07, 'epoch': 19.34} +{'loss': 35.1681, 'grad_norm': 302.015625, 'learning_rate': 8.874773139745917e-07, 'epoch': 19.34} +{'loss': 34.2712, 'grad_norm': 325.37005615234375, 'learning_rate': 8.82032667876588e-07, 'epoch': 19.34} +{'loss': 31.3185, 'grad_norm': 291.301513671875, 'learning_rate': 8.765880217785844e-07, 'epoch': 19.35} +{'loss': 22.3868, 'grad_norm': 190.09767150878906, 'learning_rate': 8.711433756805807e-07, 'epoch': 19.35} + 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5370/5520 [4:43:56<07:20, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6009277105331421, 'eval_runtime': 3.1385, 'eval_samples_per_second': 57.033, 'eval_steps_per_second': 57.033, 'epoch': 19.35} +{'loss': 21.1889, 'grad_norm': 231.69676208496094, 'learning_rate': 8.656987295825772e-07, 'epoch': 19.35} +{'loss': 23.1246, 'grad_norm': 255.91258239746094, 'learning_rate': 8.602540834845735e-07, 'epoch': 19.36} +{'loss': 22.9017, 'grad_norm': 265.2499694824219, 'learning_rate': 8.548094373865699e-07, 'epoch': 19.36} +{'loss': 38.4372, 'grad_norm': 217.06552124023438, 'learning_rate': 8.493647912885663e-07, 'epoch': 19.36} +{'loss': 38.8259, 'grad_norm': 220.9014434814453, 'learning_rate': 8.439201451905627e-07, 'epoch': 19.37} +{'loss': 37.7587, 'grad_norm': 217.46336364746094, 'learning_rate': 8.38475499092559e-07, 'epoch': 19.37} +{'loss': 38.2973, 'grad_norm': 219.59889221191406, 'learning_rate': 8.330308529945554e-07, 'epoch': 19.38} +{'loss': 36.6878, 'grad_norm': 206.93772888183594, 'learning_rate': 8.275862068965517e-07, 'epoch': 19.38} +{'loss': 37.4095, 'grad_norm': 268.5470886230469, 'learning_rate': 8.221415607985482e-07, 'epoch': 19.38} +{'loss': 39.1159, 'grad_norm': 228.70216369628906, 'learning_rate': 8.166969147005445e-07, 'epoch': 19.39} + 97%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5380/5520 [4:44:29<06:49, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6011511087417603, 'eval_runtime': 3.1369, 'eval_samples_per_second': 57.063, 'eval_steps_per_second': 57.063, 'epoch': 19.39} +{'loss': 38.8929, 'grad_norm': 212.8670654296875, 'learning_rate': 8.112522686025409e-07, 'epoch': 19.39} +{'loss': 39.7208, 'grad_norm': 228.0734405517578, 'learning_rate': 8.058076225045372e-07, 'epoch': 19.39} +{'loss': 38.3748, 'grad_norm': 239.56906127929688, 'learning_rate': 8.003629764065335e-07, 'epoch': 19.4} +{'loss': 37.3178, 'grad_norm': 243.6251220703125, 'learning_rate': 7.9491833030853e-07, 'epoch': 19.4} +{'loss': 36.5418, 'grad_norm': 407.86907958984375, 'learning_rate': 7.894736842105263e-07, 'epoch': 19.4} +{'loss': 36.9031, 'grad_norm': 260.6579284667969, 'learning_rate': 7.840290381125227e-07, 'epoch': 19.41} +{'loss': 35.4851, 'grad_norm': 358.63946533203125, 'learning_rate': 7.785843920145191e-07, 'epoch': 19.41} +{'loss': 34.6983, 'grad_norm': 414.06634521484375, 'learning_rate': 7.731397459165155e-07, 'epoch': 19.42} +{'loss': 36.7265, 'grad_norm': 471.287109375, 'learning_rate': 7.676950998185117e-07, 'epoch': 19.42} +{'loss': 35.4779, 'grad_norm': 366.92767333984375, 'learning_rate': 7.622504537205082e-07, 'epoch': 19.42} + 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5390/5520 [4:45:01<06:21, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6010181903839111, 'eval_runtime': 3.147, 'eval_samples_per_second': 56.88, 'eval_steps_per_second': 56.88, 'epoch': 19.42} +{'loss': 36.1143, 'grad_norm': 392.23138427734375, 'learning_rate': 7.568058076225045e-07, 'epoch': 19.43} +{'loss': 37.5785, 'grad_norm': 296.0258483886719, 'learning_rate': 7.51361161524501e-07, 'epoch': 19.43} +{'loss': 37.7905, 'grad_norm': 425.22247314453125, 'learning_rate': 7.459165154264973e-07, 'epoch': 19.43} +{'loss': 36.3987, 'grad_norm': 288.7919921875, 'learning_rate': 7.404718693284937e-07, 'epoch': 19.44} +{'loss': 36.9862, 'grad_norm': 269.2157287597656, 'learning_rate': 7.3502722323049e-07, 'epoch': 19.44} +{'loss': 36.3645, 'grad_norm': 236.28067016601562, 'learning_rate': 7.295825771324864e-07, 'epoch': 19.44} +{'loss': 37.0505, 'grad_norm': 217.44627380371094, 'learning_rate': 7.241379310344827e-07, 'epoch': 19.45} +{'loss': 37.1031, 'grad_norm': 260.61175537109375, 'learning_rate': 7.186932849364792e-07, 'epoch': 19.45} +{'loss': 38.2061, 'grad_norm': 282.62017822265625, 'learning_rate': 7.132486388384755e-07, 'epoch': 19.46} +{'loss': 35.8868, 'grad_norm': 231.78170776367188, 'learning_rate': 7.07803992740472e-07, 'epoch': 19.46} + 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5400/5520 [4:45:33<05:52, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6014392375946045, 'eval_runtime': 3.1328, 'eval_samples_per_second': 57.137, 'eval_steps_per_second': 57.137, 'epoch': 19.46} +{'loss': 36.1871, 'grad_norm': 246.38380432128906, 'learning_rate': 7.023593466424683e-07, 'epoch': 19.46} +{'loss': 34.5704, 'grad_norm': 239.06924438476562, 'learning_rate': 6.969147005444645e-07, 'epoch': 19.47} +{'loss': 33.6148, 'grad_norm': 396.09027099609375, 'learning_rate': 6.91470054446461e-07, 'epoch': 19.47} +{'loss': 31.535, 'grad_norm': 250.8205108642578, 'learning_rate': 6.860254083484573e-07, 'epoch': 19.47} +{'loss': 31.6366, 'grad_norm': 257.0039978027344, 'learning_rate': 6.805807622504538e-07, 'epoch': 19.48} +{'loss': 30.4001, 'grad_norm': 283.7515563964844, 'learning_rate': 6.751361161524501e-07, 'epoch': 19.48} +{'loss': 31.1016, 'grad_norm': 335.6957702636719, 'learning_rate': 6.696914700544465e-07, 'epoch': 19.48} +{'loss': 31.7707, 'grad_norm': 338.0590515136719, 'learning_rate': 6.642468239564429e-07, 'epoch': 19.49} +{'loss': 34.904, 'grad_norm': 409.0957946777344, 'learning_rate': 6.588021778584392e-07, 'epoch': 19.49} +{'loss': 32.1701, 'grad_norm': 265.0601806640625, 'learning_rate': 6.533575317604355e-07, 'epoch': 19.49} + 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5400/5520 [4:45:36<05:52, 2.94s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. + 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5410/5520 [4:46:06<05:24, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6015393137931824, 'eval_runtime': 3.1382, 'eval_samples_per_second': 57.04, 'eval_steps_per_second': 57.04, 'epoch': 19.49} +{'loss': 32.7803, 'grad_norm': 354.3403625488281, 'learning_rate': 6.47912885662432e-07, 'epoch': 19.5} +{'loss': 33.4401, 'grad_norm': 257.71124267578125, 'learning_rate': 6.424682395644283e-07, 'epoch': 19.5} +{'loss': 33.3075, 'grad_norm': 325.73876953125, 'learning_rate': 6.370235934664248e-07, 'epoch': 19.51} +{'loss': 34.5868, 'grad_norm': 283.1676940917969, 'learning_rate': 6.315789473684211e-07, 'epoch': 19.51} +{'loss': 34.2399, 'grad_norm': 265.0743713378906, 'learning_rate': 6.261343012704174e-07, 'epoch': 19.51} +{'loss': 35.8848, 'grad_norm': 381.4061279296875, 'learning_rate': 6.206896551724138e-07, 'epoch': 19.52} +{'loss': 34.5162, 'grad_norm': 311.1829833984375, 'learning_rate': 6.152450090744102e-07, 'epoch': 19.52} +{'loss': 34.0525, 'grad_norm': 301.8170471191406, 'learning_rate': 6.098003629764065e-07, 'epoch': 19.52} +{'loss': 28.6084, 'grad_norm': 276.9403076171875, 'learning_rate': 6.04355716878403e-07, 'epoch': 19.53} +{'loss': 21.827, 'grad_norm': 221.44195556640625, 'learning_rate': 5.989110707803993e-07, 'epoch': 19.53} +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. + 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5420/5520 [4:46:38<04:53, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.601222813129425, 'eval_runtime': 3.1388, 'eval_samples_per_second': 57.029, 'eval_steps_per_second': 57.029, 'epoch': 19.53} +{'loss': 21.4303, 'grad_norm': 215.0915069580078, 'learning_rate': 5.934664246823957e-07, 'epoch': 19.53} +{'loss': 22.3575, 'grad_norm': 230.7354736328125, 'learning_rate': 5.88021778584392e-07, 'epoch': 19.54} +{'loss': 23.2244, 'grad_norm': 257.53533935546875, 'learning_rate': 5.825771324863883e-07, 'epoch': 19.54} +{'loss': 37.427, 'grad_norm': 226.0248260498047, 'learning_rate': 5.771324863883848e-07, 'epoch': 19.55} +{'loss': 38.7522, 'grad_norm': 204.3394775390625, 'learning_rate': 5.716878402903811e-07, 'epoch': 19.55} +{'loss': 38.0999, 'grad_norm': 213.9196014404297, 'learning_rate': 5.662431941923776e-07, 'epoch': 19.55} +{'loss': 38.154, 'grad_norm': 183.85964965820312, 'learning_rate': 5.607985480943739e-07, 'epoch': 19.56} +{'loss': 38.0258, 'grad_norm': 212.41763305664062, 'learning_rate': 5.553539019963702e-07, 'epoch': 19.56} +{'loss': 38.8271, 'grad_norm': 225.71121215820312, 'learning_rate': 5.499092558983666e-07, 'epoch': 19.56} +{'loss': 37.5532, 'grad_norm': 235.203125, 'learning_rate': 5.44464609800363e-07, 'epoch': 19.57} + 98%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5430/5520 [4:47:10<04:24, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6008686423301697, 'eval_runtime': 3.1367, 'eval_samples_per_second': 57.067, 'eval_steps_per_second': 57.067, 'epoch': 19.57} +{'loss': 40.1166, 'grad_norm': 208.5715789794922, 'learning_rate': 5.390199637023593e-07, 'epoch': 19.57} +{'loss': 37.8543, 'grad_norm': 264.13909912109375, 'learning_rate': 5.335753176043558e-07, 'epoch': 19.57} +{'loss': 39.0052, 'grad_norm': 218.41786193847656, 'learning_rate': 5.281306715063521e-07, 'epoch': 19.58} +{'loss': 38.3982, 'grad_norm': 286.737060546875, 'learning_rate': 5.226860254083486e-07, 'epoch': 19.58} +{'loss': 37.0972, 'grad_norm': 291.76617431640625, 'learning_rate': 5.172413793103448e-07, 'epoch': 19.59} +{'loss': 34.297, 'grad_norm': 300.4125671386719, 'learning_rate': 5.117967332123412e-07, 'epoch': 19.59} +{'loss': 35.5142, 'grad_norm': 359.1770935058594, 'learning_rate': 5.063520871143376e-07, 'epoch': 19.59} +{'loss': 36.9965, 'grad_norm': 384.48028564453125, 'learning_rate': 5.009074410163339e-07, 'epoch': 19.6} +{'loss': 37.3736, 'grad_norm': 415.5469055175781, 'learning_rate': 4.954627949183303e-07, 'epoch': 19.6} +{'loss': 36.4009, 'grad_norm': 236.56715393066406, 'learning_rate': 4.900181488203267e-07, 'epoch': 19.6} + 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5440/5520 [4:47:43<04:01, 3.02s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6017860770225525, 'eval_runtime': 3.1384, 'eval_samples_per_second': 57.035, 'eval_steps_per_second': 57.035, 'epoch': 19.6} +{'loss': 35.4744, 'grad_norm': 411.9438171386719, 'learning_rate': 4.845735027223231e-07, 'epoch': 19.61} +{'loss': 36.853, 'grad_norm': 306.6455993652344, 'learning_rate': 4.791288566243194e-07, 'epoch': 19.61} +{'loss': 37.7418, 'grad_norm': 289.98883056640625, 'learning_rate': 4.736842105263158e-07, 'epoch': 19.61} +{'loss': 36.2866, 'grad_norm': 227.83628845214844, 'learning_rate': 4.682395644283122e-07, 'epoch': 19.62} +{'loss': 35.5141, 'grad_norm': 260.56695556640625, 'learning_rate': 4.627949183303086e-07, 'epoch': 19.62} +{'loss': 37.7585, 'grad_norm': 236.0625762939453, 'learning_rate': 4.573502722323049e-07, 'epoch': 19.62} +{'loss': 39.0317, 'grad_norm': 299.8916015625, 'learning_rate': 4.5190562613430125e-07, 'epoch': 19.63} +{'loss': 38.0213, 'grad_norm': 236.15243530273438, 'learning_rate': 4.4646098003629764e-07, 'epoch': 19.63} +{'loss': 37.9197, 'grad_norm': 291.18182373046875, 'learning_rate': 4.41016333938294e-07, 'epoch': 19.64} +{'loss': 36.2963, 'grad_norm': 243.15419006347656, 'learning_rate': 4.3557168784029036e-07, 'epoch': 19.64} + 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5450/5520 [4:48:15<03:26, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6023871302604675, 'eval_runtime': 3.1823, 'eval_samples_per_second': 56.249, 'eval_steps_per_second': 56.249, 'epoch': 19.64} +{'loss': 36.5442, 'grad_norm': 247.60049438476562, 'learning_rate': 4.3012704174228675e-07, 'epoch': 19.64} +{'loss': 34.3726, 'grad_norm': 268.0565490722656, 'learning_rate': 4.2468239564428314e-07, 'epoch': 19.65} +{'loss': 32.1691, 'grad_norm': 251.00057983398438, 'learning_rate': 4.192377495462795e-07, 'epoch': 19.65} +{'loss': 31.5831, 'grad_norm': 321.4367370605469, 'learning_rate': 4.1379310344827586e-07, 'epoch': 19.65} +{'loss': 32.1178, 'grad_norm': 328.7476501464844, 'learning_rate': 4.0834845735027225e-07, 'epoch': 19.66} +{'loss': 30.9057, 'grad_norm': 264.1122741699219, 'learning_rate': 4.029038112522686e-07, 'epoch': 19.66} +{'loss': 32.1608, 'grad_norm': 443.7752380371094, 'learning_rate': 3.97459165154265e-07, 'epoch': 19.66} +{'loss': 32.152, 'grad_norm': 239.18614196777344, 'learning_rate': 3.9201451905626137e-07, 'epoch': 19.67} +{'loss': 32.9004, 'grad_norm': 259.49249267578125, 'learning_rate': 3.8656987295825776e-07, 'epoch': 19.67} +{'loss': 32.8006, 'grad_norm': 270.965576171875, 'learning_rate': 3.811252268602541e-07, 'epoch': 19.68} + 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ– | 5460/5520 [4:48:48<02:57, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6013672947883606, 'eval_runtime': 3.15, 'eval_samples_per_second': 56.826, 'eval_steps_per_second': 56.826, 'epoch': 19.68} +{'loss': 32.5202, 'grad_norm': 261.61962890625, 'learning_rate': 3.756805807622505e-07, 'epoch': 19.68} +{'loss': 33.8715, 'grad_norm': 421.79974365234375, 'learning_rate': 3.7023593466424687e-07, 'epoch': 19.68} +{'loss': 33.7379, 'grad_norm': 257.54522705078125, 'learning_rate': 3.647912885662432e-07, 'epoch': 19.69} +{'loss': 34.0276, 'grad_norm': 290.4663391113281, 'learning_rate': 3.593466424682396e-07, 'epoch': 19.69} +{'loss': 33.697, 'grad_norm': 358.5994567871094, 'learning_rate': 3.53901996370236e-07, 'epoch': 19.69} +{'loss': 36.1719, 'grad_norm': 387.7028503417969, 'learning_rate': 3.4845735027223227e-07, 'epoch': 19.7} +{'loss': 34.2382, 'grad_norm': 358.7620544433594, 'learning_rate': 3.4301270417422866e-07, 'epoch': 19.7} +{'loss': 34.9605, 'grad_norm': 395.00140380859375, 'learning_rate': 3.3756805807622505e-07, 'epoch': 19.7} +{'loss': 25.9945, 'grad_norm': 291.4330749511719, 'learning_rate': 3.3212341197822143e-07, 'epoch': 19.71} +{'loss': 21.5688, 'grad_norm': 218.69113159179688, 'learning_rate': 3.2667876588021777e-07, 'epoch': 19.71} + 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹ | 5470/5520 [4:49:20<02:27, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.601553201675415, 'eval_runtime': 3.1514, 'eval_samples_per_second': 56.799, 'eval_steps_per_second': 56.799, 'epoch': 19.71} +{'loss': 21.5821, 'grad_norm': 236.54107666015625, 'learning_rate': 3.2123411978221416e-07, 'epoch': 19.72} +{'loss': 21.9283, 'grad_norm': 272.6702880859375, 'learning_rate': 3.1578947368421055e-07, 'epoch': 19.72} +{'loss': 22.9388, 'grad_norm': 264.74005126953125, 'learning_rate': 3.103448275862069e-07, 'epoch': 19.72} +{'loss': 39.0581, 'grad_norm': 207.89337158203125, 'learning_rate': 3.0490018148820327e-07, 'epoch': 19.73} +{'loss': 38.9328, 'grad_norm': 240.96636962890625, 'learning_rate': 2.9945553539019966e-07, 'epoch': 19.73} +{'loss': 38.9602, 'grad_norm': 239.6488037109375, 'learning_rate': 2.94010889292196e-07, 'epoch': 19.73} +{'loss': 39.4486, 'grad_norm': 233.20974731445312, 'learning_rate': 2.885662431941924e-07, 'epoch': 19.74} +{'loss': 39.0727, 'grad_norm': 224.98013305664062, 'learning_rate': 2.831215970961888e-07, 'epoch': 19.74} +{'loss': 37.5583, 'grad_norm': 245.26980590820312, 'learning_rate': 2.776769509981851e-07, 'epoch': 19.74} +{'loss': 38.6332, 'grad_norm': 205.14044189453125, 'learning_rate': 2.722323049001815e-07, 'epoch': 19.75} + 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰ | 5480/5520 [4:49:52<01:58, 2.96s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6001273393630981, 'eval_runtime': 3.1492, 'eval_samples_per_second': 56.84, 'eval_steps_per_second': 56.84, 'epoch': 19.75} +{'loss': 38.4545, 'grad_norm': 229.19940185546875, 'learning_rate': 2.667876588021779e-07, 'epoch': 19.75} +{'loss': 37.4461, 'grad_norm': 260.04083251953125, 'learning_rate': 2.613430127041743e-07, 'epoch': 19.75} +{'loss': 38.6207, 'grad_norm': 252.2135772705078, 'learning_rate': 2.558983666061706e-07, 'epoch': 19.76} +{'loss': 36.4307, 'grad_norm': 211.760009765625, 'learning_rate': 2.5045372050816695e-07, 'epoch': 19.76} +{'loss': 35.7522, 'grad_norm': 227.18177795410156, 'learning_rate': 2.4500907441016334e-07, 'epoch': 19.77} +{'loss': 35.3123, 'grad_norm': 276.8219299316406, 'learning_rate': 2.395644283121597e-07, 'epoch': 19.77} +{'loss': 35.8374, 'grad_norm': 302.77362060546875, 'learning_rate': 2.341197822141561e-07, 'epoch': 19.77} +{'loss': 36.6637, 'grad_norm': 279.4811096191406, 'learning_rate': 2.2867513611615246e-07, 'epoch': 19.78} +{'loss': 35.9263, 'grad_norm': 390.7204284667969, 'learning_rate': 2.2323049001814882e-07, 'epoch': 19.78} +{'loss': 37.1529, 'grad_norm': 250.87916564941406, 'learning_rate': 2.1778584392014518e-07, 'epoch': 19.78} + 99%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 5490/5520 [4:50:24<01:27, 2.93s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6011965274810791, 'eval_runtime': 3.1499, 'eval_samples_per_second': 56.826, 'eval_steps_per_second': 56.826, 'epoch': 19.78} +{'loss': 36.2204, 'grad_norm': 285.9814453125, 'learning_rate': 2.1234119782214157e-07, 'epoch': 19.79} +{'loss': 36.4458, 'grad_norm': 263.5719299316406, 'learning_rate': 2.0689655172413793e-07, 'epoch': 19.79} +{'loss': 35.8917, 'grad_norm': 252.95606994628906, 'learning_rate': 2.014519056261343e-07, 'epoch': 19.79} +{'loss': 37.6994, 'grad_norm': 400.2224426269531, 'learning_rate': 1.9600725952813068e-07, 'epoch': 19.8} +{'loss': 36.6016, 'grad_norm': 304.3626403808594, 'learning_rate': 1.9056261343012705e-07, 'epoch': 19.8} +{'loss': 38.4323, 'grad_norm': 328.90875244140625, 'learning_rate': 1.8511796733212343e-07, 'epoch': 19.81} +{'loss': 37.1693, 'grad_norm': 242.90084838867188, 'learning_rate': 1.796733212341198e-07, 'epoch': 19.81} +{'loss': 36.9844, 'grad_norm': 246.82679748535156, 'learning_rate': 1.7422867513611613e-07, 'epoch': 19.81} +{'loss': 37.1382, 'grad_norm': 247.83578491210938, 'learning_rate': 1.6878402903811252e-07, 'epoch': 19.82} +{'loss': 39.0924, 'grad_norm': 346.5638732910156, 'learning_rate': 1.6333938294010889e-07, 'epoch': 19.82} +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–| 5500/5520 [4:50:57<00:58, 2.94s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6002302765846252, 'eval_runtime': 3.1467, 'eval_samples_per_second': 56.884, 'eval_steps_per_second': 56.884, 'epoch': 19.82} +{'loss': 35.7904, 'grad_norm': 268.696044921875, 'learning_rate': 1.5789473684210527e-07, 'epoch': 19.82} +{'loss': 34.8324, 'grad_norm': 236.77597045898438, 'learning_rate': 1.5245009074410164e-07, 'epoch': 19.83} +{'loss': 30.9181, 'grad_norm': 282.07012939453125, 'learning_rate': 1.47005444646098e-07, 'epoch': 19.83} +{'loss': 29.8768, 'grad_norm': 304.3028259277344, 'learning_rate': 1.415607985480944e-07, 'epoch': 19.83} +{'loss': 29.9774, 'grad_norm': 345.91217041015625, 'learning_rate': 1.3611615245009075e-07, 'epoch': 19.84} +{'loss': 30.6578, 'grad_norm': 305.09893798828125, 'learning_rate': 1.3067150635208714e-07, 'epoch': 19.84} +{'loss': 31.6408, 'grad_norm': 279.6992492675781, 'learning_rate': 1.2522686025408348e-07, 'epoch': 19.85} +{'loss': 32.7726, 'grad_norm': 433.50579833984375, 'learning_rate': 1.1978221415607984e-07, 'epoch': 19.85} +{'loss': 33.2589, 'grad_norm': 264.6114196777344, 'learning_rate': 1.1433756805807623e-07, 'epoch': 19.85} +{'loss': 32.5284, 'grad_norm': 233.0192108154297, 'learning_rate': 1.0889292196007259e-07, 'epoch': 19.86} +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‹| 5510/5520 [4:51:29<00:29, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.6009184718132019, 'eval_runtime': 3.1505, 'eval_samples_per_second': 56.816, 'eval_steps_per_second': 56.816, 'epoch': 19.86} +{'loss': 34.1255, 'grad_norm': 268.5655212402344, 'learning_rate': 1.0344827586206897e-07, 'epoch': 19.86} +{'loss': 32.1586, 'grad_norm': 242.97332763671875, 'learning_rate': 9.800362976406534e-08, 'epoch': 19.86} +{'loss': 33.2971, 'grad_norm': 250.1754913330078, 'learning_rate': 9.255898366606172e-08, 'epoch': 19.87} +{'loss': 32.6599, 'grad_norm': 303.9489440917969, 'learning_rate': 8.711433756805807e-08, 'epoch': 19.87} +{'loss': 33.5164, 'grad_norm': 282.8628845214844, 'learning_rate': 8.166969147005444e-08, 'epoch': 19.87} +{'loss': 33.9399, 'grad_norm': 319.90228271484375, 'learning_rate': 7.622504537205082e-08, 'epoch': 19.88} +{'loss': 35.1216, 'grad_norm': 324.5431213378906, 'learning_rate': 7.07803992740472e-08, 'epoch': 19.88} +{'loss': 34.3538, 'grad_norm': 312.98297119140625, 'learning_rate': 6.533575317604357e-08, 'epoch': 19.88} +{'loss': 27.5229, 'grad_norm': 331.80718994140625, 'learning_rate': 5.989110707803992e-08, 'epoch': 19.89} +{'loss': 22.0451, 'grad_norm': 228.25613403320312, 'learning_rate': 5.4446460980036295e-08, 'epoch': 19.89} +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5520/5520 [4:52:01<00:00, 2.95s/it]Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead. +{'eval_loss': 0.599698543548584, 'eval_runtime': 3.1515, 'eval_samples_per_second': 56.798, 'eval_steps_per_second': 56.798, 'epoch': 19.89} +{'loss': 21.0534, 'grad_norm': 211.70677185058594, 'learning_rate': 4.900181488203267e-08, 'epoch': 19.9} +{'loss': 22.478, 'grad_norm': 209.34217834472656, 'learning_rate': 4.3557168784029033e-08, 'epoch': 19.9} +{'loss': 23.3247, 'grad_norm': 219.7806396484375, 'learning_rate': 3.811252268602541e-08, 'epoch': 19.9} +{'loss': 37.8099, 'grad_norm': 243.7207489013672, 'learning_rate': 3.2667876588021785e-08, 'epoch': 19.91} +{'loss': 37.3804, 'grad_norm': 236.4864044189453, 'learning_rate': 2.7223230490018148e-08, 'epoch': 19.91} +{'loss': 38.5405, 'grad_norm': 269.2445373535156, 'learning_rate': 2.1778584392014517e-08, 'epoch': 19.91} +{'loss': 37.7808, 'grad_norm': 190.2155303955078, 'learning_rate': 1.6333938294010892e-08, 'epoch': 19.92} +{'loss': 39.002, 'grad_norm': 228.72300720214844, 'learning_rate': 1.0889292196007258e-08, 'epoch': 19.92} +{'loss': 37.3566, 'grad_norm': 305.3551025390625, 'learning_rate': 5.444646098003629e-09, 'epoch': 19.92} +{'loss': 34.3978, 'grad_norm': 300.5411071777344, 'learning_rate': 0.0, 'epoch': 19.93} +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5520/5520 [4:52:04<00:00, 2.95s/it]You are using a model of type llama to instantiate a model of type llama_lowdim. This is not supported for all configurations of models and can yield errors. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5520/5520 [4:52:05<00:00, 3.17s/it] +{'eval_loss': 0.6009259223937988, 'eval_runtime': 3.1539, 'eval_samples_per_second': 56.756, 'eval_steps_per_second': 56.756, 'epoch': 19.93} +{'train_runtime': 17525.7354, 'train_samples_per_second': 20.222, 'train_steps_per_second': 0.315, 'train_loss': 38.1645543354145, 'epoch': 19.93} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/wandb-metadata.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..1b88c9bc37474db4268b99e86721fae2c76b9bb9 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/wandb-metadata.json @@ -0,0 +1,49 @@ +{ + "os": "Linux-5.15.0-101-generic-x86_64-with-glibc2.31", + "python": "3.9.21", + "startedAt": "2025-03-10T01:20:27.392098Z", + "program": "/home/chyang/workspace/LLM-BC/./llmbc/workspace/train_llm_workspace.py", + "codePath": "llmbc/workspace/train_llm_workspace.py", + "git": { + "remote": "https://github.com/CHYang25/LLM-BC.git", + "commit": "ec37a6bce756581f839f10930e4346ac6b4a3d63" + }, + "email": "chris920325@gmail.com", + "root": "/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2", + "host": "user", + "username": "chyang", + "executable": "/home/chyang/miniconda3/envs/llm-bc/bin/python3", + "codePathLocal": "llmbc/workspace/train_llm_workspace.py", + "cpu_count": 20, + "cpu_count_logical": 40, + "gpu": "NVIDIA GeForce RTX 4090", + "gpu_count": 2, + "disk": { + "/": { + "total": "1967317549056", + "used": "1595126108160" + } + }, + "memory": { + "total": "134536400896" + }, + "cpu": { + "count": 20, + "countLogical": 40 + }, + "gpu_nvidia": [ + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + }, + { + "name": "NVIDIA GeForce RTX 4090", + "memoryTotal": "25757220864", + "cudaCores": 16384, + "architecture": "Ada" + } + ], + "cudaVersion": "12.2" +} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/wandb-summary.json b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..599805ddae7c469669c636f084ca7baf0ce1b2bb --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/files/wandb-summary.json @@ -0,0 +1 @@ +{"_timestamp":1.741587166312121e+09,"eval/samples_per_second":56.756,"train/learning_rate":0,"eval/loss":0.6009259223937988,"eval/runtime":3.1539,"train_samples_per_second":20.222,"train/grad_norm":300.5411071777344,"_wandb":{"runtime":17538},"train/epoch":19.92821670428894,"train/global_step":5520,"eval/steps_per_second":56.756,"total_flos":2.7070932149777203e+17,"train/loss":34.3978,"train_runtime":17525.7354,"_runtime":17538.92010038,"train_steps_per_second":0.315,"train_loss":38.1645543354145,"_step":6072} \ No newline at end of file diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-core.log b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..a8f4b0feeb678500d12f50b208eb4f8b0a5d9880 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-core.log @@ -0,0 +1,16 @@ +{"time":"2025-03-10T09:20:26.81369481+08:00","level":"INFO","msg":"started logging, with flags","port-filename":"/tmp/tmp81xvmws3/port-3973458.txt","pid":3973458,"debug":false,"disable-analytics":false} +{"time":"2025-03-10T09:20:26.813715088+08:00","level":"INFO","msg":"FeatureState","shutdownOnParentExitEnabled":false} +{"time":"2025-03-10T09:20:26.814562178+08:00","level":"INFO","msg":"server is running","addr":{"IP":"127.0.0.1","Port":43241,"Zone":""}} +{"time":"2025-03-10T09:20:26.814587619+08:00","level":"INFO","msg":"Will exit if parent process dies.","ppid":3973458} +{"time":"2025-03-10T09:20:27.011813726+08:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"127.0.0.1:56100"} +{"time":"2025-03-10T09:20:27.394930937+08:00","level":"INFO","msg":"handleInformInit: received","streamId":"6awu8klx","id":"127.0.0.1:56100"} +{"time":"2025-03-10T09:20:27.501803346+08:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"6awu8klx","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:49.833063168+08:00","level":"INFO","msg":"handleInformFinish: finish message received","streamId":"6awu8klx","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:49.833433088+08:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"6awu8klx","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:50.472952579+08:00","level":"INFO","msg":"handleInformTeardown: server teardown initiated","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:50.473016368+08:00","level":"INFO","msg":"handleInformTeardown: server shutdown complete","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:50.473030885+08:00","level":"INFO","msg":"server is shutting down"} +{"time":"2025-03-10T14:12:50.473088577+08:00","level":"INFO","msg":"connection: Close: initiating connection closure","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:50.473197237+08:00","level":"INFO","msg":"connection: Close: connection successfully closed","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:50.473252706+08:00","level":"INFO","msg":"connection: ManageConnectionData: connection closed","id":"127.0.0.1:56100"} +{"time":"2025-03-10T14:12:50.473272189+08:00","level":"INFO","msg":"server is closed"} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-internal.log b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..d47d248bd7e877599a5823e406f6c4b1db663b90 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-internal.log @@ -0,0 +1,17 @@ +{"time":"2025-03-10T09:20:27.395245778+08:00","level":"INFO","msg":"using version","core version":"0.18.6"} +{"time":"2025-03-10T09:20:27.395272821+08:00","level":"INFO","msg":"created symlink","path":"/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-core.log"} +{"time":"2025-03-10T09:20:27.501784545+08:00","level":"INFO","msg":"created new stream","id":"6awu8klx"} +{"time":"2025-03-10T09:20:27.501800611+08:00","level":"INFO","msg":"stream: started","id":"6awu8klx"} +{"time":"2025-03-10T09:20:27.501864561+08:00","level":"INFO","msg":"sender: started","stream_id":"6awu8klx"} +{"time":"2025-03-10T09:20:27.501854739+08:00","level":"INFO","msg":"writer: Do: started","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T09:20:27.502027573+08:00","level":"INFO","msg":"handler: started","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T09:20:28.05035496+08:00","level":"INFO","msg":"Starting system monitor"} +{"time":"2025-03-10T14:12:46.314827029+08:00","level":"INFO","msg":"Stopping system monitor"} +{"time":"2025-03-10T14:12:46.315636093+08:00","level":"INFO","msg":"Stopped system monitor"} +{"time":"2025-03-10T14:12:47.315465643+08:00","level":"INFO","msg":"handler: operation stats","stats":{"operations":[{"desc":"saving job artifact","runtime_seconds":0.404143887},{"desc":"uploading output.log","runtime_seconds":0.161471187,"progress":"990.7KB/990.7KB"},{"desc":"uploading config.yaml","runtime_seconds":0.161454803,"progress":"15.6KB/15.6KB"}],"total_operations":3}} +{"time":"2025-03-10T14:12:48.510476348+08:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2025-03-10T14:12:49.833150896+08:00","level":"INFO","msg":"stream: closing","id":"6awu8klx"} +{"time":"2025-03-10T14:12:49.833196761+08:00","level":"INFO","msg":"handler: closed","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T14:12:49.833235054+08:00","level":"INFO","msg":"writer: Close: closed","stream_id":{"value":"6awu8klx"}} +{"time":"2025-03-10T14:12:49.833251953+08:00","level":"INFO","msg":"sender: closed","stream_id":"6awu8klx"} +{"time":"2025-03-10T14:12:49.833413851+08:00","level":"INFO","msg":"stream: closed","id":"6awu8klx"} diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug.log b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..2e9a5446b7901eb8c9425ec184d05040be64cf54 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug.log @@ -0,0 +1,35 @@ +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Current SDK version is 0.18.6 +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Configure stats pid to 3973458 +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Loading settings from /home/chyang/.config/wandb/settings +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Loading settings from /home/chyang/workspace/LLM-BC/wandb/settings +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Loading settings from environment variables: {} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Applying setup settings: {'mode': 'online', '_disable_service': None} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Inferring run settings from compute environment: {'program_relpath': 'llmbc/workspace/train_llm_workspace.py', 'program_abspath': '/home/chyang/workspace/LLM-BC/llmbc/workspace/train_llm_workspace.py', 'program': '/home/chyang/workspace/LLM-BC/./llmbc/workspace/train_llm_workspace.py'} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_setup.py:_flush():79] Applying login settings: {} +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_init.py:_log_setup():533] Logging user logs to /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug.log +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_init.py:_log_setup():534] Logging internal logs to /home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/logs/debug-internal.log +2025-03-10 09:20:27,390 INFO MainThread:3973458 [wandb_init.py:init():619] calling init triggers +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():626] wandb.init called with sweep_config: {} +config: {'name': 'train_llm_lowdim', '_target_': 'llmbc.workspace.train_llm_workspace.TrainLLMWorkspace', 'obs_dim': 9, 'action_dim': 4, 'horizon': 1, 'n_obs_steps': 1, 'n_action_steps': 1, 'task_name': 'push-v2', 'exp_name': 'train llm', 'model_name': 'meta-llama/Llama-3.2-1B-Instruct', 'use_quantization': False, 'lora_config': {'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'task_type': 'CAUSAL_LM'}, 'dataset': {'test_data_ratio': 0.01}, 'debug': False, 'training': {'seed': 42, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 64, 'optim': 'paged_adamw_32bit', 'num_train_epochs': 20, 'eval_strategy': 'steps', 'logging_steps': 1, 'warmup_steps': 10, 'logging_strategy': 'steps', 'learning_rate': 3e-05, 'fp16': False, 'bf16': True, 'tf32': True, 'group_by_length': True, 'report_to': 'wandb', 'save_steps': 200, 'eval_steps': 10, 'use_joint_mlp_projector': True, 'joint_obs_action_mlp_lr': 1e-06}, 'trainer': {'obs_dim': 9, 'action_dim': 4, 'use_joint_mlp_projector': True, 'max_seq_length': 100, 'dataset_text_field': 'text', 'packing': False}, 'logging': {'project': 'llm_module_finetuning', 'resume': True, 'mode': 'online', 'name': '2025.03.10-09.20.26_train_llm_lowdim_push-v2', 'tags': ['train_llm_lowdim', 'push-v2', 'train llm'], 'id': None, 'group': None}, 'multi_run': {'run_dir': 'data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2', 'wandb_name_base': '2025.03.10-09.20.26_train_llm_lowdim_push-v2'}, 'task': {'name': 'push-v2', 'obs_dim': 9, 'action_dim': 4, 'env_runner': {'_target_': 'llmbc.env_runner.metaworld_lowdim_runner.MetaworldLowdimRunner', 'env_name': 'llf-metaworld-push-v2', 'max_steps': 30, 'n_obs_steps': 1, 'n_action_steps': 1, 'instruction_type': 'b', 'feedback_type': ['hp', 'hn', 'fp'], 'visual': False}, 'dataset': {'_target_': 'llmbc.dataset.metaworld_lowdim_dataset.MetaworldLowdimDataset', 'data_path': 'datasets/push-v2-general.pt', 'data_path2': 'datasets/push-v2.pt', 'horizon': 1, 'pad_before': 0, 'pad_after': 0, 'obs_eef_target': True, 'use_manual_normalizer': False, 'val_ratio': 0.2, 'dummy_normalizer': True}, 'instructor': {'_target_': 'llmbc.translator.instructor.metaworld_instructor.push_v2_instructor.PushV2Instructor'}}, 'llm': {'name': 'meta-llama/Llama-3.2-1B-Instruct', 'model_name': 'Llama-3.2-1B-Instruct', 'use_quantization': False, 'load_from_checkpoint': False, 'adaptor_path': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.05/13.39.46_train_llm_lowdim_sweep-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-sweep-v2/checkpoint-3550', 'use_orig_model': False, 'use_joint_mlp_projector': True, 'load_from_mlp_projector_checkpoint': True, 'mlp_projector_checkpoint_path': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.09/12.41.09_train_mlp_projector_metaworld/checkpoints/latest.ckpt', 'max_length': 100, 'config_target': 'llmbc.model.llm.llama_lowdim_model.LowdimLlamaConfig', 'causal_lm_target': 'llmbc.model.llm.llama_lowdim_model.LowdimLlamaForCausalLM', 'lora_config': {'r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'bias': 'none', 'task_type': 'CAUSAL_LM'}, 'prompter': {'_target_': 'llmbc.translator.prompter.llama_prompter.LlamaPrompter', 'use_joint_mlp_projector': True}, 'hydra': {'job': {'override_dirname': 'meta-llama/Llama-3.2-1B-Instruct'}, 'run': {'dir': 'data/outputs/2025.03.10/09.20.26_meta-llama/Llama-3.2-1B-Instruct'}}}} +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():669] starting backend +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():673] sending inform_init request +2025-03-10 09:20:27,391 INFO MainThread:3973458 [backend.py:_multiprocessing_setup():104] multiprocessing start_methods=fork,spawn,forkserver, using: spawn +2025-03-10 09:20:27,391 INFO MainThread:3973458 [wandb_init.py:init():686] backend started and connected +2025-03-10 09:20:27,395 INFO MainThread:3973458 [wandb_init.py:init():781] updated telemetry +2025-03-10 09:20:27,414 INFO MainThread:3973458 [wandb_init.py:init():814] communicating run to backend with 90.0 second timeout +2025-03-10 09:20:28,047 INFO MainThread:3973458 [wandb_init.py:init():867] starting run threads in backend +2025-03-10 09:20:28,132 INFO MainThread:3973458 [wandb_run.py:_console_start():2451] atexit reg +2025-03-10 09:20:28,132 INFO MainThread:3973458 [wandb_run.py:_redirect():2299] redirect: wrap_raw +2025-03-10 09:20:28,133 INFO MainThread:3973458 [wandb_run.py:_redirect():2364] Wrapping output streams. +2025-03-10 09:20:28,133 INFO MainThread:3973458 [wandb_run.py:_redirect():2389] Redirects installed. +2025-03-10 09:20:28,136 INFO MainThread:3973458 [wandb_init.py:init():911] run started, returning control to user process +2025-03-10 09:20:40,580 INFO MainThread:3973458 [wandb_run.py:_config_callback():1389] config_cb None None {'peft_config': {'default': {'task_type': 'CAUSAL_LM', 'peft_type': , 'auto_mapping': None, 'base_model_name_or_path': 'meta-llama/Llama-3.2-1B-Instruct', 'revision': None, 'inference_mode': False, 'r': 16, 'target_modules': {'v_proj', 'down_proj', 'gate_proj', 'q_proj', 'o_proj', 'k_proj', 'up_proj'}, 'exclude_modules': None, 'lora_alpha': 32, 'lora_dropout': 0.05, 'fan_in_fan_out': False, 'bias': 'none', 'use_rslora': False, 'modules_to_save': None, 'init_lora_weights': True, 'layers_to_transform': None, 'layers_pattern': None, 'rank_pattern': {}, 'alpha_pattern': {}, 'megatron_config': None, 'megatron_core': 'megatron.core', 'loftq_config': {}, 'eva_config': None, 'use_dora': False, 'layer_replication': None, 'runtime_config': {'ephemeral_gpu_offload': False}, 'lora_bias': False}}, 'obs_dim': 9, 'action_dim': 4, 'use_joint_mlp_projector': True, 'vocab_size': 128256, 'max_position_embeddings': 131072, 'hidden_size': 2048, 'intermediate_size': 8192, 'num_hidden_layers': 16, 'num_attention_heads': 32, 'num_key_value_heads': 8, 'hidden_act': 'silu', 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'pretraining_tp': 1, 'use_cache': False, 'rope_theta': 500000.0, 'rope_scaling': {'factor': 32.0, 'high_freq_factor': 4.0, 'low_freq_factor': 1.0, 'original_max_position_embeddings': 8192, 'rope_type': 'llama3'}, 'attention_bias': False, 'attention_dropout': 0.0, 'mlp_bias': False, 'head_dim': 64, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'architectures': ['LlamaForCausalLM'], 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 128000, 'pad_token_id': None, 'eos_token_id': [128001, 128008, 128009], 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, '_name_or_path': 'meta-llama/Llama-3.2-1B-Instruct', '_attn_implementation_autoset': True, 'transformers_version': '4.47.1', 'model_type': 'llama_lowdim', 'output_dir': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2', 'overwrite_output_dir': False, 'do_train': False, 'do_eval': True, 'do_predict': False, 'eval_strategy': 'steps', 'prediction_loss_only': False, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 64, 'eval_accumulation_steps': None, 'eval_delay': 0, 'torch_empty_cache_steps': None, 'learning_rate': 3e-05, 'weight_decay': 0.0, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 20, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'lr_scheduler_kwargs': {}, 'warmup_ratio': 0.0, 'warmup_steps': 10, 'log_level': 'passive', 'log_level_replica': 'warning', 'log_on_each_node': True, 'logging_dir': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2/runs/Mar10_09-20-37_user', 'logging_strategy': 'steps', 'logging_first_step': False, 'logging_steps': 1, 'logging_nan_inf_filter': True, 'save_strategy': 'steps', 'save_steps': 200, 'save_total_limit': None, 'save_safetensors': True, 'save_on_each_node': False, 'save_only_model': False, 'restore_callback_states_from_checkpoint': False, 'no_cuda': False, 'use_cpu': False, 'use_mps_device': False, 'seed': 42, 'data_seed': None, 'jit_mode_eval': False, 'use_ipex': False, 'bf16': True, 'fp16': False, 'fp16_opt_level': 'O1', 'half_precision_backend': 'auto', 'bf16_full_eval': False, 'fp16_full_eval': False, 'tf32': True, 'local_rank': 0, 'ddp_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': False, 'eval_steps': 10, 'dataloader_num_workers': 0, 'dataloader_prefetch_factor': None, 'past_index': -1, 'run_name': '/home/chyang/workspace/LLM-BC/data/outputs/2025.03.10/09.20.26_train_llm_lowdim_push-v2/meta-llama/Llama-3.2-1B-Instruct-finetuned-push-v2', 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'fsdp': [], 'fsdp_min_num_params': 0, 'fsdp_config': {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}, 'fsdp_transformer_layer_cls_to_wrap': None, 'accelerator_config': {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}, 'deepspeed': None, 'label_smoothing_factor': 0.0, 'optim': 'paged_adamw_32bit', 'optim_args': None, 'adafactor': False, 'group_by_length': True, 'length_column_name': 'length', 'report_to': ['wandb'], 'ddp_find_unused_parameters': None, 'ddp_bucket_cap_mb': None, 'ddp_broadcast_buffers': None, 'dataloader_pin_memory': True, 'dataloader_persistent_workers': False, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_strategy': 'every_save', 'hub_token': '', 'hub_private_repo': None, 'hub_always_push': False, 'gradient_checkpointing': False, 'gradient_checkpointing_kwargs': None, 'include_inputs_for_metrics': False, 'include_for_metrics': [], 'eval_do_concat_batches': True, 'fp16_backend': 'auto', 'evaluation_strategy': None, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': '', 'mp_parameters': '', 'auto_find_batch_size': False, 'full_determinism': False, 'torchdynamo': None, 'ray_scope': 'last', 'ddp_timeout': 1800, 'torch_compile': False, 'torch_compile_backend': None, 'torch_compile_mode': None, 'dispatch_batches': None, 'split_batches': None, 'include_tokens_per_second': False, 'include_num_input_tokens_seen': False, 'neftune_noise_alpha': None, 'optim_target_modules': None, 'batch_eval_metrics': False, 'eval_on_start': False, 'use_liger_kernel': False, 'eval_use_gather_object': False, 'average_tokens_across_devices': False, 'dataset_text_field': 'text', 'packing': False, 'max_seq_length': 100, 'dataset_num_proc': None, 'dataset_batch_size': 1000, 'model_init_kwargs': None, 'dataset_kwargs': {}, 'eval_packing': None, 'num_of_sequences': 1024, 'chars_per_token': '', 'use_liger': False, 'joint_obs_action_mlp_lr': 1e-06, 'obs_mlp_lr': None, 'action_mlp_lr': None} +2025-03-10 09:20:40,583 INFO MainThread:3973458 [wandb_config.py:__setitem__():154] config set model/num_parameters = 1251311616 - > +2025-03-10 09:20:40,583 INFO MainThread:3973458 [wandb_run.py:_config_callback():1389] config_cb model/num_parameters 1251311616 None +2025-03-10 14:12:46,313 INFO MainThread:3973458 [wandb_run.py:_finish():2146] finishing run chyang25-national-taiwan-university/llm_module_finetuning/6awu8klx +2025-03-10 14:12:46,314 INFO MainThread:3973458 [wandb_run.py:_atexit_cleanup():2414] got exitcode: 0 +2025-03-10 14:12:46,314 INFO MainThread:3973458 [wandb_run.py:_restore():2396] restore +2025-03-10 14:12:46,314 INFO MainThread:3973458 [wandb_run.py:_restore():2402] restore done +2025-03-10 14:12:49,828 INFO MainThread:3973458 [wandb_run.py:_footer_history_summary_info():3963] rendering history +2025-03-10 14:12:49,829 INFO MainThread:3973458 [wandb_run.py:_footer_history_summary_info():3995] rendering summary +2025-03-10 14:12:49,832 INFO MainThread:3973458 [wandb_run.py:_footer_sync_info():3922] logging synced files diff --git a/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/run-6awu8klx.wandb b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/run-6awu8klx.wandb new file mode 100644 index 0000000000000000000000000000000000000000..55e92a5ab922b84bb08ea7427a70cf8dd27b6c14 --- /dev/null +++ b/2025.03.10/09.20.26_train_llm_lowdim_push-v2/wandb/run-20250310_092027-6awu8klx/run-6awu8klx.wandb @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc83fcdc354f75fcd6bdb3140ac68ee7abe7fce88f9fa6ba090274d29c62be29 +size 26372124